Merge pull request #8556 from google/benvanik-buffer-usage
Cleaning up HAL buffer transfer usage.
diff --git a/build_tools/bazel/build_core.sh b/build_tools/bazel/build_core.sh
index 15c31c7..1e63328 100755
--- a/build_tools/bazel/build_core.sh
+++ b/build_tools/bazel/build_core.sh
@@ -13,7 +13,7 @@
# Looks at environment variables and uses CI-friendly defaults if they are not
# set.
# IREE_LLVMAOT_DISABLE: Do not run tests that require LLVM-AOT. Default: 0
-# IREE_VULKAN_DISABLE: Do not run tests that require Vulkan. Default: 1
+# IREE_VULKAN_DISABLE: Do not run tests that require Vulkan. Default: 0
# BUILD_TAG_FILTERS: Passed to bazel to filter targets to build.
# See https://docs.bazel.build/versions/master/command-line-reference.html#flag--build_tag_filters)
# Default: "-nokokoro"
diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py b/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py
index 63c9133..aad06d4 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_targets.py
@@ -97,6 +97,22 @@
"MhloDialect",
"MLIRMhloUtils",
],
+ "@mlir-hlo//:hlo_legalize_shape_ops_to_standard": [
+ "tensorflow::external_mhlo_includes",
+ "MhloShapeOpsToStandard",
+ ],
+ "@mlir-hlo//:hlo_legalize_to_arithmetic": [
+ "tensorflow::external_mhlo_includes",
+ "MhloToArithmeticConversion",
+ ],
+ "@mlir-hlo//:hlo_legalize_to_lhlo": [
+ "tensorflow::external_mhlo_includes",
+ "MhloToLhloConversion",
+ ],
+ "@mlir-hlo//:hlo_legalize_to_memref": [
+ "tensorflow::external_mhlo_includes",
+ "MhloToMemrefConversion",
+ ],
"@mlir-hlo//:legalize_control_flow": [
"tensorflow::external_mhlo_includes",
"MhloToStandard",
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml b/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
index bc59ae6..2d13d15 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/pipeline.yml
@@ -40,9 +40,9 @@
- "tar xzf build-artifacts.tgz"
- "find build-android/ -name '*.cmake' -exec sed -i \"s!\\$IREE_DOCKER_WORKDIR/!\\$PWD/!g\" {} \\;"
- "cd build-android/"
- # vulkan tests using khr_shader_float16_int8 are failing on pixel4.
- # Disabling it until we identify the root cause.
- - "ctest --timeout 900 --output-on-failure --label-exclude \"^vulkan_uses_vk_khr_shader_float16_int8\\$\""
+ # Pixel 4 ships an old Adreno GPU driver that triggers quite a few bugs in our tests.
+ # Disable Vulkan tests entirely on Pixel 4; the Moto Edge X30 covers Adreno GPU testing for us.
+ - "ctest --timeout 900 --output-on-failure --label-exclude \"vulkan\""
agents:
- "android-soc=snapdragon-855"
- "queue=test-android"
@@ -50,6 +50,21 @@
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
timeout_in_minutes: "15"
+ - label: "test on Moto Edge X30 (snapdragon-8gen1, adreno-730)"
+ commands:
+ - "git clean -fdx"
+ - "buildkite-agent artifact download --step build build-artifacts.tgz ./"
+ - "tar xzf build-artifacts.tgz"
+ - "find build-android/ -name '*.cmake' -exec sed -i \"s!\\$IREE_DOCKER_WORKDIR/!\\$PWD/!g\" {} \\;"
+ - "cd build-android/"
+ - "ctest --timeout 900 --output-on-failure"
+ agents:
+ - "android-soc=snapdragon-8gen1"
+ - "queue=test-android"
+ env:
+ IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
+ timeout_in_minutes: "15"
+
notify:
- email: "bdi-build-cop+buildkite@grotations.appspotmail.com"
if: build.state == "failed"
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build.sh
index 1e0826c..0577ada 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build.sh
@@ -52,8 +52,10 @@
# Respect the user setting, but default to as many jobs as we have cores.
export CTEST_PARALLEL_LEVEL=${CTEST_PARALLEL_LEVEL:-$(nproc)}
-# Respect the user setting, but default to turning on vulkan and llvmaot.
-export IREE_VULKAN_DISABLE=${IREE_VULKAN_DISABLE:-0}
+# Respect the user setting, but default to turning off the vulkan tests
+# and turning on the llvmaot ones.
+# TODO(#5716): Fix and enable Vulkan tests.
+export IREE_VULKAN_DISABLE=${IREE_VULKAN_DISABLE:-1}
export IREE_LLVMAOT_DISABLE=${IREE_LLVMAOT_DISABLE:-0}
# CUDA is off by default.
export IREE_CUDA_DISABLE=${IREE_CUDA_DISABLE:-1}
@@ -103,11 +105,7 @@
# These tests currently have asan failures
# TODO(#5715): Fix these
declare -a excluded_tests=(
- # Mysterious "LeakSanitizer has encountered a fatal error." crashes
"iree/samples/simple_embedding/simple_embedding_vulkan_test"
- "iree/tools/test/iree-benchmark-module.mlir.test"
- "iree/tools/test/iree-run-module.mlir.test"
- "iree/tools/test/multiple_exported_functions.mlir.test"
)
# Prefix with `^` anchor
diff --git a/integrations/tensorflow/WORKSPACE b/integrations/tensorflow/WORKSPACE
index 3cd4a16..410fd2f 100644
--- a/integrations/tensorflow/WORKSPACE
+++ b/integrations/tensorflow/WORKSPACE
@@ -7,7 +7,7 @@
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
-TENSORFLOW_COMMIT = "58b667263ba5734ba175a7640f1a1d572e52bd3d"
+TENSORFLOW_COMMIT = "fe3fd49d08db3174730123cbab2fed8bbec9cf1b"
git_repository(
name = "org_tensorflow",
diff --git a/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc b/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
index 2c26542..e20c24d 100644
--- a/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
+++ b/integrations/tensorflow/build_tools/bazel/iree-tf.bazelrc
@@ -13,3 +13,4 @@
# Flags specific for working around tensorflow warnings.
build:generic_clang --copt=-Wno-inconsistent-missing-override --host_copt=-Wno-inconsistent-missing-override
+build:generic_clang --copt=-Wno-c++11-narrowing --host_copt=-Wno-c++11-narrowing
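For context on the new suppression: -Wc++11-narrowing fires when a braced initializer implicitly narrows a value, and Clang treats it as an error by default, which the updated TensorFlow commit presumably trips. A minimal standalone reproducer (illustrative only, not taken from TensorFlow):

// Illustrative reproducer for the class of diagnostic being silenced. Clang
// rejects the brace initialization below by default; it compiles once
// -Wno-c++11-narrowing is passed, as the bazelrc line above now does.
#include <cstdint>

int main() {
  int64_t big = 42;
  int32_t narrowed{big};  // narrowing int64_t -> int32_t in a braced init
  return narrowed == 42 ? 0 : 1;
}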
diff --git a/iree/compiler/Codegen/Common/TypePropagationPass.cpp b/iree/compiler/Codegen/Common/TypePropagationPass.cpp
index 851e64c..f120f46 100644
--- a/iree/compiler/Codegen/Common/TypePropagationPass.cpp
+++ b/iree/compiler/Codegen/Common/TypePropagationPass.cpp
@@ -238,7 +238,7 @@
if (outputType == legalizedOutputType) {
return rewriter.notifyMatchFailure(fillOp, "op already legal");
}
- Value value = adaptor.value();
+ Value value = adaptor.inputs().front();
Optional<Type> legalizedElementType =
getLegalizedElementType(value.getType());
if (!legalizedElementType) {
@@ -246,8 +246,8 @@
}
Value legalizedValue = convertElementType(
rewriter, fillOp->getLoc(), legalizedElementType.getValue(), value);
- rewriter.replaceOpWithNewOp<linalg::FillOp>(fillOp, legalizedValue,
- adaptor.output());
+ rewriter.replaceOpWithNewOp<linalg::FillOp>(
+ fillOp, ValueRange{legalizedValue}, ValueRange{adaptor.outputs()});
return success();
}
};
diff --git a/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir b/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
index 6f2493f..0907d4c 100644
--- a/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
+++ b/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir
@@ -74,7 +74,7 @@
%lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%iv0, 0], sizes = [%tilesize_y, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} -> tensor<?x?xf32>
%rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [0, %iv1], sizes = [%k, %tilesize_x], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%init_tile = linalg.init_tensor [%tilesize_y, %tilesize_x] : tensor<?x?xf32>
- %fill_tile = linalg.fill(%cst, %init_tile) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill_tile = linalg.fill ins(%cst : f32) outs(%init_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
%matmul_tile = linalg.matmul ins(%lhs_tile, %rhs_tile : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %matmul_tile, %result, offsets = [%iv0, %iv1], sizes = [%tilesize_y, %tilesize_x], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n}
}
@@ -90,7 +90,8 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]]
// CHECK-DAG: %[[RHS_TILE:.+]] = flow.dispatch.tensor.load %[[RHS]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = flow.dispatch.tensor.load %[[RESULT]]
-// CHECK: %[[FILL_TILE:.+]] = linalg.fill(%{{.+}}, %[[RESULT_TILE]])
+// CHECK: %[[FILL_TILE:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: %[[MATMUL_TILE:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]] : tensor<?x?xf32>, tensor<?x?xf32>)
// CHECK-SAME: outs(%[[FILL_TILE]] : tensor<?x?xf32>)
@@ -303,7 +304,7 @@
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c1, %c32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x1024xf32> -> tensor<?x?x1024xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%c1, 1024, %c32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<?x1024x?xf32>
%9 = linalg.init_tensor [1, 32, 32] : tensor<1x32x32xf32>
- %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x32x32xf32> -> tensor<1x32x32xf32>
+ %10 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%9 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%11 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup", is_root_op} ins(%7, %8 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%12 = tensor.cast %11 : tensor<1x32x32xf32> to tensor<?x?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%c1, %c32, %c32], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x32x64xf32>
@@ -321,7 +322,8 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]]
// CHECK-DAG: %[[RHS_TILE:.+]] = flow.dispatch.tensor.load %[[RHS]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = flow.dispatch.tensor.load %[[RESULT]]
-// CHECK: %[[FILL_TILE:.+]] = linalg.fill(%{{.+}}, %[[RESULT_TILE]])
+// CHECK: %[[FILL_TILE:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: %[[MATMUL_TILE:.+]] = linalg.batch_matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[FILL_TILE]]
@@ -497,7 +499,7 @@
%8 = tensor.extract_slice %cst_0[%arg1] [64] [1] : tensor<64xf32> to tensor<64xf32>
%9 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:6400x64xf32> -> tensor<64x64xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x64xf32> -> tensor<64x64xf32>
- %11 = linalg.fill(%cst_1, %7) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+ %11 = linalg.fill ins(%cst_1 : f32) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
%12 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [8, 32, 0], [0, 0, 16]]>} ins(%9, %10 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%11 : tensor<64x64xf32>) -> tensor<64x64xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%8, %12 : tensor<64xf32>, tensor<64x64xf32>) outs(%7 : tensor<64x64xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
@@ -522,7 +524,8 @@
// CHECK-NOT: linalg.init_tensor
// CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[OUTPUT]]
// CHECK-NOT: linalg.init_tensor
-// CHECK: linalg.fill(%{{.+}}, %[[LOAD]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[LOAD]] :
// CHECK: %[[MATMUL:.+]] = linalg.matmul
// CHECK: %[[GENERIC:.+]] = linalg.generic
// CHECK-SAME: outs(%[[MATMUL]] :
@@ -557,7 +560,7 @@
%12 = affine.min affine_map<(d0) -> (-d0 + 49, 16)>(%arg1)
%13 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [16, %12], strides = [1, 1] : !flow.dispatch.tensor<readonly:16x49xf32> -> tensor<16x?xf32>
%14 = linalg.init_tensor [%10, %12] : tensor<?x?xf32>
- %15 = linalg.fill(%cst, %14) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %15 = linalg.fill ins(%cst : f32) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
%16 = linalg.matmul ins(%11, %13 : tensor<?x16xf32>, tensor<16x?xf32>) outs(%15 : tensor<?x?xf32>) -> tensor<?x?xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%16 : tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3: f32):
diff --git a/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir b/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
index 5fb20f8..b9d84ee 100644
--- a/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
+++ b/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
@@ -99,7 +99,7 @@
%lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%iv0, 0], sizes = [%tilesize_y, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} -> tensor<?x?xf32>
%rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [0, %iv1], sizes = [%k, %tilesize_x], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%init_tile = linalg.init_tensor [%tilesize_y, %tilesize_x] : tensor<?x?xf32>
- %fill_tile = linalg.fill(%cst, %init_tile) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill_tile = linalg.fill ins(%cst : f32) outs(%init_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
%matmul_tile = linalg.matmul ins(%lhs_tile, %rhs_tile : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill_tile : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %matmul_tile, %result, offsets = [%iv0, %iv1], sizes = [%tilesize_y, %tilesize_x], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>{%m, %n}
}
@@ -136,7 +136,9 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] [%[[TILESIZE_Y]], %[[K]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[K]], %[[TILESIZE_X]]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILESIZE_Y]], %[[TILESIZE_X]]]
-// CHECK: linalg.fill(%[[CST]], %[[RESULT_TILE]])
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[CST]] :
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
diff --git a/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir b/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
index a9eba4e..5e40783 100644
--- a/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
+++ b/iree/compiler/Codegen/Common/test/linalg_bufferize.mlir
@@ -736,7 +736,7 @@
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_x]
%12 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c2, %11], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x?xf32>
%13 = linalg.init_tensor [%9, %11] : tensor<?x?xf32>
- %14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
%15 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%10, %12 : tensor<?x2xf32>, tensor<2x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%9, %11], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1x3xf32>
}
@@ -754,7 +754,8 @@
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[RESHAPE_LHS]][%[[IV0]], 0]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]]
-// CHECK: linalg.fill(%{{.+}}, %[[RESULT_TILE]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
@@ -920,7 +921,7 @@
%2 = linalg.init_tensor [3, 9] : tensor<3x9xi32>
%3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%4 = tensor.extract %3[] : tensor<i32>
- %5 = linalg.fill(%4, %2) : i32, tensor<3x9xi32> -> tensor<3x9xi32>
+ %5 = linalg.fill ins(%4 : i32) outs(%2 : tensor<3x9xi32>) -> tensor<3x9xi32>
flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [3, 9], strides = [1, 1] : tensor<3x9xi32> -> !flow.dispatch.tensor<writeonly:3x9xi32>
return
}
@@ -929,7 +930,9 @@
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
// CHECK: %[[LOAD:.+]] = memref.load %[[ARG0]]
-// CHECK: linalg.fill(%[[LOAD]], %[[RET0]])
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[LOAD]] :
+// CHECK-SAME: outs(%[[RET0]] :
// -----
@@ -992,7 +995,7 @@
%10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 5)>(%arg1)[%workgroup_size_x]
%11 = tensor.extract_slice %cst[0, %arg1] [3, %10] [1, 1] : tensor<3x5xf32> to tensor<3x?xf32>
%12 = linalg.init_tensor [%8, %10] : tensor<?x?xf32>
- %13 = linalg.fill(%cst_0, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %13 = linalg.fill ins(%cst_0 : f32) outs(%12 : tensor<?x?xf32>) -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%9, %11 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %1, offsets = [%arg0, %arg1], sizes = [%8, %10], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32>
}
@@ -1011,7 +1014,8 @@
// CHECK-DAG: %[[LHS_SUBVIEW:.+]] = memref.subview %[[LHS]][%[[IV0]], 0]
// CHECK-DAG: %[[RHS_SUBVIEW:.+]] = memref.subview %[[RHS]][0, %[[IV1]]]
// CHECK-DAG: %[[RESULT_SUBVIEW:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]]
-// CHECK: linalg.fill(%{{.+}}, %[[RESULT_SUBVIEW]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[RESULT_SUBVIEW]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_SUBVIEW]], %[[RHS_SUBVIEW]]
// CHECK-SAME: outs(%[[RESULT_SUBVIEW]]
@@ -1067,7 +1071,7 @@
%5 = tensor.extract %4[] : tensor<f32>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 4, 6, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x4x6x1xf32> -> tensor<1x4x6x1xf32>
%7 = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xf32>
- %8 = linalg.fill(%5, %7) : f32, tensor<1x2x2x1xf32> -> tensor<1x2x2x1xf32>
+ %8 = linalg.fill ins(%5 : f32) outs(%7 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
%9 = linalg.pooling_nhwc_sum {
dilations = dense<1> : vector<2xi64>,
strides = dense<[2, 3]> : vector<2xi64>
@@ -1083,7 +1087,9 @@
// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1x4x6x1xf32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x2x2x1xf32>
// CHECK: %[[INIT_VAL:.+]] = memref.load %[[INIT]][] : memref<f32>
-// CHECK: linalg.fill(%[[INIT_VAL]], %[[RET0]]) : f32, memref<1x2x2x1xf32>
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[INIT_VAL]] :
+// CHECK-SAME: outs(%[[RET0]] :
// CHECK: linalg.pooling_nhwc_sum
// CHECK-SAME: dilations = dense<1> : vector<2xi64>
// CHECK-SAME: strides = dense<[2, 3]> : vector<2xi64>
@@ -1212,7 +1218,7 @@
%cst = arith.constant 0.0 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -1238,7 +1244,8 @@
// CHECK-NOT: memref.alloc
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
-// CHECK: linalg.fill(%{{.+}}, %[[OUTPUT]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[OUTPUT]] :
// CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>)
// CHECK-NEXT: linalg.generic
@@ -1262,13 +1269,13 @@
%cst0 = arith.constant 0.0 : f32
%cst1 = arith.constant 1.0 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst0, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst0 : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
outs(%1 : tensor<1x112x112x32xf32>)
-> tensor<1x112x112x32xf32>
- %3 = linalg.fill(%cst1, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %3 = linalg.fill ins(%cst1 : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%4 = linalg.generic {
indexing_maps = [
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
@@ -1289,10 +1296,12 @@
// CHECK-LABEL: func @dont_use_buffer_for_operand_when_output_tensor_used()
// CHECK: %[[ALLOC:.+]] = memref.alloc
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
-// CHECK: linalg.fill(%{{.+}}, %[[ALLOC]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC]] :
// CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[ALLOC]] : memref<1x112x112x32xf32>)
-// CHECK-NEXT: linalg.fill(%{{.+}}, %[[OUTPUT]])
+// CHECK-NEXT: linalg.fill
+// CHECK-SAME: outs(%[[OUTPUT]] :
// CHECK-NEXT: linalg.generic
// CHECK-SAME: ins(%[[ALLOC]], %{{.+}} : memref<1x112x112x32xf32>, memref<32xf32>)
// CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>)
@@ -1365,7 +1374,7 @@
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c1, %c32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x1024xf32> -> tensor<?x?x1024xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%c1, 1024, %c32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<?x1024x?xf32>
%9 = linalg.init_tensor [1, 32, 32] : tensor<1x32x32xf32>
- %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x32x32xf32> -> tensor<1x32x32xf32>
+ %10 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%9 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%11 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup", is_root_op} ins(%7, %8 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%12 = tensor.cast %11 : tensor<1x32x32xf32> to tensor<?x?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%c1, %c32, %c32], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x32x64xf32>
@@ -1383,7 +1392,9 @@
// CHECK: %[[LHSV:.+]] = memref.subview %[[LHS]]
// CHECK: %[[RHSV:.+]] = memref.subview %[[RHS]]
// CHECK: %[[RESULTV:.+]] = memref.subview %[[RESULT]]
-// CHECK: linalg.fill(%[[ZERO]], %[[RESULTV]])
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[ZERO]] :
+// CHECK-SAME: outs(%[[RESULTV]] :
// CHECK: linalg.batch_matmul {{.*}} ins(%[[LHSV]], %[[RHSV]] : {{.*}}) outs(%[[RESULTV]]
// -----
@@ -1621,7 +1632,7 @@
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 27], strides = [1, 1] : !flow.dispatch.tensor<readonly:12544x27xf32> -> tensor<64x27xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [27, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:27x16xf32> -> tensor<27x16xf32>
%9 = linalg.init_tensor [64, 16] : tensor<64x16xf32>
- %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<64x16xf32> -> tensor<64x16xf32>
+ %10 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%9 : tensor<64x16xf32>) -> tensor<64x16xf32>
%11 = tensor.pad %7 low[0, 0] high[0, 5] {
^bb0(%arg2: index, %arg3: index): // no predecessors
tensor.yield %cst : f32
@@ -1639,23 +1650,29 @@
}
// CHECK-LABEL: func @padded_matmul()
-// CHECK-DAG: %[[LHS_PADDED:.+]] = memref.alloc() : memref<64x32xf32>
-// CHECK-DAG: %[[RHS_PADDED:.+]] = memref.alloc() : memref<32x16xf32>
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
-// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x27xf32>
-// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<27x16xf32>
-// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x16xf32>
-// CHECK-DAG: %[[LHS_V:.+]] = memref.subview %[[LHS]][%{{.*}}, 0] [64, 27] [1, 1]
-// CHECK-DAG: %[[RHS_V:.+]] = memref.subview %[[RHS]][0, %{{.*}}] [27, 16] [1, 1]
-// CHECK-DAG: %[[DST_V:.+]] = memref.subview %[[DST]][%{{.*}}, %{{.*}}] [64, 16] [1, 1]
-// CHECK: linalg.fill(%[[C0]], %[[DST_V]])
-// CHECK: linalg.fill(%[[C0]], %[[LHS_PADDED]]) : f32, memref<64x32xf32>
-// CHECK: %[[LHS_PADDED_INTER:.+]] = memref.subview %[[LHS_PADDED]][0, 0] [64, 27] [1, 1]
-// CHECK: linalg.generic {{.*}} ins(%[[LHS_V]] {{.*}} outs(%[[LHS_PADDED_INTER]]
-// CHECK: linalg.fill(%[[C0]], %[[RHS_PADDED]]) : f32, memref<32x16xf32>
-// CHECK: %[[RHS_PADDED_INTER:.+]] = memref.subview %[[RHS_PADDED]][0, 0] [27, 16] [1, 1]
-// CHECK: linalg.generic {{.*}} ins(%[[RHS_V]] {{.*}} outs(%[[RHS_PADDED_INTER]]
-// CHECK: linalg.matmul ins(%[[LHS_PADDED]], %[[RHS_PADDED]] : memref<64x32xf32>, memref<32x16xf32>)
+// CHECK-DAG: %[[LHS_PADDED:.+]] = memref.alloc() : memref<64x32xf32>
+// CHECK-DAG: %[[RHS_PADDED:.+]] = memref.alloc() : memref<32x16xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x27xf32>
+// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<27x16xf32>
+// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x16xf32>
+// CHECK-DAG: %[[LHS_V:.+]] = memref.subview %[[LHS]][%{{.*}}, 0] [64, 27] [1, 1]
+// CHECK-DAG: %[[RHS_V:.+]] = memref.subview %[[RHS]][0, %{{.*}}] [27, 16] [1, 1]
+// CHECK-DAG: %[[DST_V:.+]] = memref.subview %[[DST]][%{{.*}}, %{{.*}}] [64, 16] [1, 1]
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[C0]] :
+// CHECK-SAME: outs(%[[DST_V]] :
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[C0]] :
+// CHECK-SAME: outs(%[[LHS_PADDED]] :
+// CHECK: %[[LHS_PADDED_INTER:.+]] = memref.subview %[[LHS_PADDED]][0, 0] [64, 27] [1, 1]
+// CHECK: linalg.generic {{.*}} ins(%[[LHS_V]] {{.*}} outs(%[[LHS_PADDED_INTER]]
+// CHECK: linalg.fill
+// CHECK-SAME: ins(%[[C0]] :
+// CHECK-SAME: outs(%[[RHS_PADDED]] :
+// CHECK: %[[RHS_PADDED_INTER:.+]] = memref.subview %[[RHS_PADDED]][0, 0] [27, 16] [1, 1]
+// CHECK: linalg.generic {{.*}} ins(%[[RHS_V]] {{.*}} outs(%[[RHS_PADDED_INTER]]
+// CHECK: linalg.matmul ins(%[[LHS_PADDED]], %[[RHS_PADDED]] : memref<64x32xf32>, memref<32x16xf32>)
// -----
@@ -1695,7 +1712,7 @@
tensor.yield %cst : f32
} : tensor<2x?xf32> to tensor<4x4xf32>
%15 = linalg.init_tensor [4, 4] : tensor<4x4xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<4x4xf32>) -> tensor<4x4xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %14 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%16 : tensor<4x4xf32>) -> tensor<4x4xf32>
%18 = tensor.extract_slice %17[0, 0] [%7, %9] [1, 1] : tensor<4x4xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %18, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n}
@@ -1720,12 +1737,15 @@
// CHECK-DAG: %[[TILE_N:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[N]]]
// CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
-// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG0]]
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC_ARG0]] :
// CHECK: %[[ALLOC_ARG0_SV:.+]] = memref.subview %[[ALLOC_ARG0]]
// CHECK: linalg.generic {{.*}} ins(%[[ARG0_SV]] {{.*}} outs(%[[ALLOC_ARG0_SV]]
-// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG1]]
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC_ARG1]] :
// CHECK: linalg.generic {{.*}} ins(%[[ARG1_SV]]
-// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_RET0]]
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[ALLOC_RET0]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC_ARG0]], %[[ALLOC_ARG1]]
// CHECK-SAME: outs(%[[ALLOC_RET0]]
@@ -1756,8 +1776,8 @@
%7 = flow.dispatch.tensor.load %0, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1} -> tensor<?x?xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1} -> tensor<?x?xi32>
%13 = linalg.init_tensor [%6] : tensor<?xi32>
- %14 = linalg.fill(%c-2147483648_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32>
- %17 = linalg.fill(%c0_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32>
+ %14 = linalg.fill {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} ins(%c-2147483648_i32 : i32) outs(%13 : tensor<?xi32>) -> tensor<?xi32>
+ %17 = linalg.fill {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} ins(%c0_i32 : i32) outs(%13 : tensor<?xi32>) -> tensor<?xi32>
%18:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%7, %9 : tensor<?x?xi32>, tensor<?x?xi32>) outs(%14, %17 : tensor<?xi32>, tensor<?xi32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering_config = {tileSizes = [[128]]}} {
^bb0(%arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32): // no predecessors
%19 = arith.cmpi sge, %arg1, %arg3 : i32
@@ -1784,9 +1804,11 @@
// CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: %[[RET0_SV:.+]] = memref.subview %[[RET0]]
-// CHECK-DAG: linalg.fill(%{{.*}}, %[[RET0_SV]]
+// CHECK-DAG: linalg.fill
+// CHECK-SAME: outs(%[[RET0_SV]] :
// CHECK-DAG: %[[RET1_SV:.+]] = memref.subview %[[RET1]]
-// CHECK-DAG: linalg.fill(%{{.*}}, %[[RET1_SV]]
+// CHECK-DAG: linalg.fill
+// CHECK-SAME: outs(%[[RET1_SV]] :
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG0_SV]], %[[ARG1_SV]]
// CHECK-SAME: outs(%[[RET0_SV]], %[[RET1_SV]]
@@ -2020,7 +2042,7 @@
%9 = affine.min #map2(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32>
%11 = linalg.init_tensor [%7, %9] : tensor<?x?xf32>
- %12 = linalg.fill(%cst, %11) {__internal_linalg_transform__ = "workgroup", lowering_config = #config0} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %12 = linalg.fill {__internal_linalg_transform__ = "workgroup", lowering_config = #config0} ins(%cst : f32) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
%13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %12) -> (tensor<?x?xf32>) {
%14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) {
@@ -2309,7 +2331,7 @@
%13 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%9, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} -> tensor<?x?xf32>
%15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [%k, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%16 = linalg.init_tensor [%9, %10] : tensor<?x?xf32>
- %17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
%18 = scf.for %arg2 = %c0 to %9 step %c4 iter_args(%arg3 = %17) -> (tensor<?x?xf32>) {
%20 = scf.for %arg4 = %c0 to %10 step %c4 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%21 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%9, %arg2)
@@ -2360,7 +2382,8 @@
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
// CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]]
// CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]]
-// CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[OUT_SUBVIEW2:.+]] = memref.subview %[[OUT_SUBVIEW1]]
@@ -2410,7 +2433,7 @@
%14 = affine.min affine_map<(d0) -> (-d0 + 1, 4)>(%arg1)
%15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [3, %14], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} -> tensor<?x?xf32>
%16 = linalg.init_tensor [%12, %14] : tensor<?x?xf32>
- %17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.fill {__internal_linalg_transform__ = "workgroup"} ins(%cst : f32) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
%18 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%17: tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -2438,7 +2461,8 @@
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
// CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]]
// CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]]
-// CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_SUBVIEW1]], %[[RHS_SUBVIEW1]] :
// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
@@ -2486,7 +2510,7 @@
%ts_2_m = affine.min #map_min(%iv2)[%c16, %ts_m]
%ts_2_n = affine.min #map_min(%iv3)[%c8, %ts_n]
%tile_init_2 = linalg.init_tensor [%ts_2_m, %ts_2_n] : tensor<?x?xf32>
- %fill_tile_2 = linalg.fill(%cst, %tile_init_2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill_tile_2 = linalg.fill ins(%cst : f32) outs(%tile_init_2 : tensor<?x?xf32>) -> tensor<?x?xf32>
%lhs_tile_2 = tensor.extract_slice %lhs_tile[%iv2, 0] [%ts_2_m, %k] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%rhs_tile_2 = tensor.extract_slice %rhs_tile[0, %iv3] [%k, %ts_2_n] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%matmul_tile_2 = linalg.matmul
@@ -2528,7 +2552,8 @@
// CHECK: scf.for %[[IV2:[a-zA-Z0-9]+]] =
// CHECK: scf.for %[[IV3:[a-zA-Z0-9]+]] =
// CHECK: %[[RESULT_TILE_2:.+]] = memref.subview %[[RESULT_TILE]][%[[IV2]], %[[IV3]]]
-// CHECK: linalg.fill(%{{.+}}, %[[RESULT_TILE_2]])
+// CHECK: linalg.fill
+// CHECK-SAME: outs(%[[RESULT_TILE_2]] :
// CHECK-DAG: %[[LHS_TILE_2:.+]] = memref.subview %[[LHS_TILE]][%[[IV2]], 0]
// CHECK-DAG: %[[RHS_TILE_2:.+]] = memref.subview %[[RHS_TILE]][0, %[[IV3]]]
// CHECK: linalg.matmul
diff --git a/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index bb8498b..8aef8cb 100644
--- a/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -266,7 +266,7 @@
%8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [%0, %3, %2], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:?x?x?xf32>{%0, %3, %2} -> tensor<?x?x?xf32>
%9 = linalg.init_tensor [%0, %1, %2] : tensor<?x?x?xf32>
- %10 = linalg.fill(%cst, %9) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
%11 = linalg.batch_matmul {lowering_config = #config}
ins(%7, %8 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%10 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
flow.dispatch.tensor.store %11, %6, offsets = [0, 0, 0], sizes = [%0, %1, %2], strides = [1, 1, 1]
@@ -323,7 +323,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1]
: !flow.dispatch.tensor<readonly:256x512xf32> -> tensor<256x512xf32>
%5 = linalg.init_tensor [128, 512] : tensor<128x512xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<128x512xf32> -> tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
%7 = linalg.matmul {lowering_config = #config}
ins(%3, %4 : tensor<128x256xf32>, tensor<256x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1]
@@ -352,7 +352,8 @@
// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %{{.+}}, offsets = [%[[IV0]], 0], sizes = [32, 256]
// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %{{.+}}, offsets = [0, %[[IV1]]], sizes = [256, 16]
// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor [32, 16]
-// CHECK-DAG: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK-DAG: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK-DAG: %[[GEMM:.+]] = linalg.matmul
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: flow.dispatch.tensor.store %[[GEMM]]
@@ -787,7 +788,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 96], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x96xf32> -> tensor<3x3x96xf32>
%5 = linalg.init_tensor [1, 80, 80, 96] : tensor<1x80x80x96xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<1x80x80x96xf32> -> tensor<1x80x80x96xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
%7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #config, strides = dense<2> : tensor<2xi64>}
ins(%3, %4 : tensor<1x161x161x96xf32>, tensor<3x3x96xf32>) outs(%6 : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 96], strides = [1, 1, 1, 1]
@@ -816,7 +817,8 @@
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: scf.for %[[IV2:.+]] =
// CHECK: %[[INIT:.+]] = linalg.init_tensor [1, 20, 40, 48]
-// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: flow.dispatch.tensor.store %[[RESULT]], %{{.+}}, offsets = [0, %[[IV0]], %[[IV1]], %[[IV2]]], sizes = [1, 20, 40, 48]
@@ -913,7 +915,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [240, 40], strides = [1, 1]
: !flow.dispatch.tensor<readonly:240x40xf32> -> tensor<240x40xf32>
%5 = linalg.init_tensor [196, 40] : tensor<196x40xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<196x40xf32> -> tensor<196x40xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x40xf32>) -> tensor<196x40xf32>
%7 = linalg.matmul {lowering_config = #config}
ins(%3, %4 : tensor<196x240xf32>, tensor<240x40xf32>) outs(%6 : tensor<196x40xf32>) -> tensor<196x40xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 40], strides = [1, 1]
@@ -969,7 +971,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:5x5x576xf32> -> tensor<5x5x576xf32>
%5 = linalg.init_tensor [1, 7, 7, 576] : tensor<1x7x7x576xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<1x7x7x576xf32> -> tensor<1x7x7x576xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
%7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #config, strides = dense<1> : tensor<2xi64>}
ins(%3, %4 : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>) outs(%6 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 576], strides = [1, 1, 1, 1]
@@ -1021,7 +1023,7 @@
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [7, 7, 2048], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:7x7x2048xf32> -> tensor<7x7x2048xf32>
%1 = linalg.init_tensor [7] : tensor<7xf32>
- %2 = linalg.fill(%cst, %1) : f32, tensor<7xf32> -> tensor<7xf32>
+ %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<7xf32>) -> tensor<7xf32>
%3 = linalg.generic {
indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "reduction"]}
ins(%0 : tensor<7x7x2048xf32>) outs(%2 : tensor<7xf32>) attrs = {lowering_config = #config} {
@@ -1058,7 +1060,8 @@
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: %[[INIT0:.+]] = linalg.init_tensor
// CHECK: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[REDUCE:.+]] = linalg.generic
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: %[[GENERIC:.+]] = linalg.generic
diff --git a/iree/compiler/Codegen/Common/test/type_propagation.mlir b/iree/compiler/Codegen/Common/test/type_propagation.mlir
index 75ea3d6..a56c31e 100644
--- a/iree/compiler/Codegen/Common/test/type_propagation.mlir
+++ b/iree/compiler/Codegen/Common/test/type_propagation.mlir
@@ -225,7 +225,7 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<writeonly:?xi8>{%d}
%1 = linalg.init_tensor [%d] : tensor<?xi1>
%false = arith.constant false
- %2 = linalg.fill(%false, %1) : i1, tensor<?xi1> -> tensor<?xi1>
+ %2 = linalg.fill ins(%false : i1) outs(%1 : tensor<?xi1>) -> tensor<?xi1>
%3 = arith.extui %2 : tensor<?xi1> to tensor<?xi8>
flow.dispatch.tensor.store %3, %0, offsets=[0], sizes=[%d], strides=[1] : tensor<?xi8> -> !flow.dispatch.tensor<writeonly:?xi8>{%d}
return
@@ -235,5 +235,7 @@
// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[FALSE:.+]] = arith.constant false
// CHECK-DAG: %[[EXT_SCALAR:.+]] = arith.extui %[[FALSE]]
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[EXT_SCALAR]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[EXT_SCALAR]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: flow.dispatch.tensor.store %[[FILL]], %[[OUT]]
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 726cdd8..db883f2 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -99,6 +99,33 @@
return getVectorSize(entryPointFn, byteWidth);
}
+/// Returns the minimum tiling sizes for each dimension. A dimension may be
+/// accessed through operands with different element types, so the tiling
+/// sizes are determined by looking at all of the operands.
+static SmallVector<int64_t> getMinTilingSizesForEachDim(FuncOp entryPointFn,
+ linalg::LinalgOp op) {
+ unsigned numLoops = op.getNumLoops();
+ SmallVector<int64_t> minTileSizes(numLoops, 1);
+ auto inputOutputOpOperands = op.getInputAndOutputOperands();
+ for (auto map : llvm::enumerate(op.getIndexingMaps())) {
+ // Check the fastest varying dimension of the operand and set the minimum
+ // tile size of the corresponding loop to that operand's vector size.
+ if (map.value().getNumResults() == 0) continue;
+ auto fastestVaryingDimExpr =
+ map.value().getResults().back().dyn_cast<AffineDimExpr>();
+ if (!fastestVaryingDimExpr) continue;
+ unsigned fastestVaryingDim = fastestVaryingDimExpr.getPosition();
+
+ // If the indexing map has results, the operand has to be a shaped type.
+ auto operandType =
+ inputOutputOpOperands[map.index()]->get().getType().cast<ShapedType>();
+ minTileSizes[fastestVaryingDim] =
+ std::max<int64_t>(minTileSizes[fastestVaryingDim],
+ getVectorSize(entryPointFn, operandType));
+ }
+ return minTileSizes;
+}
+
/// Returns the type length in bytes. Looks through all the interface binding
/// ops to see the ABI types and guess-timates the type size to use. This is
/// used to convert the vector size in bytes to vector size in number of
@@ -409,11 +436,20 @@
FuncOp entryPointFn, linalg::ContractionOpInterface contractionOp,
ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
auto linalgOp = cast<linalg::LinalgOp>(contractionOp.getOperation());
+ // Consider all element types and use the smallest vector size. The tiling
+ // sizes are chosen based on the vector size.
auto lhsShapedType = contractionOp.lhs().getType().cast<ShapedType>();
+ auto rhsShapedType = contractionOp.rhs().getType().cast<ShapedType>();
+ auto resShapedType =
+ linalgOp.getOutputOperand(0)->get().getType().cast<ShapedType>();
+ int64_t vectorSize = getVectorSize(entryPointFn, lhsShapedType);
+ vectorSize = std::min(vectorSize, getVectorSize(entryPointFn, rhsShapedType));
+ vectorSize = std::min(vectorSize, getVectorSize(entryPointFn, resShapedType));
+
// Use the default distribution for the matmul loops.
unsigned numLoops = linalgOp.getNumLoops();
- int64_t vectorSize = getVectorSize(entryPointFn, lhsShapedType);
- SmallVector<int64_t> minTileSizes(numLoops, vectorSize);
+ SmallVector<int64_t> minTileSizes =
+ getMinTilingSizesForEachDim(entryPointFn, linalgOp);
SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize);
if (numLoops > 3) {
minTileSizes[0] = 1;
@@ -539,25 +575,9 @@
unsigned numLoops = genericOp.getNumLoops();
if (numLoops == 0) return success();
- SmallVector<int64_t> minTileSizes(numLoops, 1),
- maxTileSizes(numLoops, defaultWorkgroupTileSize);
- auto inputOutputOpOperands = genericOp.getInputAndOutputOperands();
- for (auto map : llvm::enumerate(genericOp.getIndexingMaps())) {
- // Check the fastest varying dimension of the operand. Set the vector size
- // of the corresponding loop to the vector size.
- if (map.value().getNumResults() == 0) continue;
- auto fastestVaryingDimExpr =
- map.value().getResults().back().dyn_cast<AffineDimExpr>();
- if (!fastestVaryingDimExpr) continue;
- unsigned fastestVaryingDim = fastestVaryingDimExpr.getPosition();
-
- // If the indexing map has result it has to be a shaped type.
- auto operandType =
- inputOutputOpOperands[map.index()]->get().getType().cast<ShapedType>();
- minTileSizes[fastestVaryingDim] =
- std::max<int64_t>(minTileSizes[fastestVaryingDim],
- getVectorSize(entryPointFn, operandType));
- }
+ SmallVector<int64_t> minTileSizes =
+ getMinTilingSizesForEachDim(entryPointFn, genericOp);
+ SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize);
if (llvm::all_of(minTileSizes, [](int64_t vs) { return vs == 1; })) {
// Nothing to vectorize just lower to loops.
return success();
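To make the heuristic in the new getMinTilingSizesForEachDim helper easier to follow: each operand's fastest-varying loop dimension (the last result of its indexing map) gets its minimum tile size raised to that operand's vector size, roughly the native vector width in bytes divided by the element byte width. The standalone sketch below mirrors that logic on plain vectors; the Operand struct, the numeric getVectorSize, and the worked i8 x i8 -> i32 matmul numbers are illustrative assumptions, not IREE's actual API.

// Simplified, dependency-free sketch of the tile-size heuristic introduced in
// KernelDispatch.cpp above. The real implementation works on linalg::LinalgOp,
// AffineMap, and ShapedType; the types here are stand-ins.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

namespace {

// One operand of a hypothetical linalg-style op: the loop dimensions its
// indexing map produces (last one = fastest varying) and its element width.
struct Operand {
  std::vector<unsigned> indexedLoopDims;  // e.g. {0, 2} for (d0, d2)
  int64_t elementByteWidth;               // e.g. 4 for f32, 1 for i8
};

// Assumed behavior of IREE's getVectorSize(): native vector width in bytes
// divided by the element byte width, i.e. how many elements fit in a vector.
int64_t getVectorSize(int64_t nativeVectorBytes, int64_t elementByteWidth) {
  return nativeVectorBytes / elementByteWidth;
}

// Mirrors getMinTilingSizesForEachDim(): every loop starts with a minimum
// tile size of 1; for each operand, the loop that is fastest varying in that
// operand is raised to the operand's vector size.
std::vector<int64_t> getMinTilingSizesForEachDim(
    unsigned numLoops, int64_t nativeVectorBytes,
    const std::vector<Operand> &operands) {
  std::vector<int64_t> minTileSizes(numLoops, 1);
  for (const Operand &operand : operands) {
    if (operand.indexedLoopDims.empty()) continue;  // scalar-like operand
    unsigned fastestVaryingDim = operand.indexedLoopDims.back();
    minTileSizes[fastestVaryingDim] =
        std::max(minTileSizes[fastestVaryingDim],
                 getVectorSize(nativeVectorBytes, operand.elementByteWidth));
  }
  return minTileSizes;
}

}  // namespace

int main() {
  // Worked example: an i8 x i8 -> i32 matmul (loops d0=M, d1=N, d2=K) on a
  // target with 16-byte native vectors. The i8 operands ask for 16-element
  // tiles on K and N; the i32 result would only ask for 4 on N, so the max
  // keeps 16 there.
  std::vector<Operand> matmul = {
      {{0, 2}, 1},  // lhs: (d0, d2), i8
      {{2, 1}, 1},  // rhs: (d2, d1), i8
      {{0, 1}, 4},  // out: (d0, d1), i32
  };
  for (int64_t size : getMinTilingSizesForEachDim(/*numLoops=*/3,
                                                  /*nativeVectorBytes=*/16,
                                                  matmul)) {
    std::cout << size << " ";  // prints: 1 16 16
  }
  std::cout << "\n";
  return 0;
}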
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 593810f..4f219c4 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -193,7 +193,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0, 0], sizes = [%B, %K, %N], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:?x?x?xf32>{%B, %K, %N} -> tensor<?x?x?xf32>
%init = linalg.init_tensor [%B, %M, %N] : tensor<?x?x?xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
%batch_gemm = linalg.batch_matmul
ins(%lhs, %rhs : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%fill : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
flow.dispatch.tensor.store %batch_gemm, %result_binding, offsets = [0, 0, 0], sizes = [%B, %M, %N], strides = [1, 1, 1]
@@ -240,7 +240,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [256, 512], strides = [1, 1]
: !flow.dispatch.tensor<readonly:256x512xf32> -> tensor<256x512xf32>
%init = linalg.init_tensor [128, 512] : tensor<128x512xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<128x512xf32> -> tensor<128x512xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<128x512xf32>) -> tensor<128x512xf32>
%gemm = linalg.matmul {compilation_info = #compilation}
ins(%lhs, %rhs : tensor<128x256xf32>, tensor<256x512xf32>)
outs(%fill : tensor<128x512xf32>) -> tensor<128x512xf32>
@@ -570,7 +570,7 @@
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x225x225x3xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
%5 = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
- %6 = linalg.fill(%cst, %5) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
%7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
return
@@ -615,7 +615,7 @@
%filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [3, 3, 96], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x96xf32> -> tensor<3x3x96xf32>
%init = linalg.init_tensor [1, 80, 80, 96] : tensor<1x80x80x96xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<1x80x80x96xf32> -> tensor<1x80x80x96xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
%conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x161x161x96xf32>, tensor<3x3x96xf32>) outs(%fill : tensor<1x80x80x96xf32>) -> tensor<1x80x80x96xf32>
flow.dispatch.tensor.store %conv, %result_binding, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 96], strides = [1, 1, 1, 1]
@@ -707,7 +707,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [240, 40], strides = [1, 1]
: !flow.dispatch.tensor<readonly:240x40xf32> -> tensor<240x40xf32>
%init = linalg.init_tensor [196, 40] : tensor<196x40xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<196x40xf32> -> tensor<196x40xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<196x40xf32>) -> tensor<196x40xf32>
%gemm = linalg.matmul ins(%lhs, %rhs : tensor<196x240xf32>, tensor<240x40xf32>)
outs(%fill : tensor<196x40xf32>) -> tensor<196x40xf32>
flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [196, 40], strides = [1, 1]
@@ -718,7 +718,7 @@
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[28, 8, 0], [4, 4, 60], [4, 4, 4]{{\]}}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[49, 8, 0], [7, 4, 60], [4, 4, 4]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUTileFuseAndVectorize>
// CHECK: hal.executable.entry_point public @matmul_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
@@ -755,7 +755,7 @@
%filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:5x5x576xf32> -> tensor<5x5x576xf32>
%init = linalg.init_tensor [1, 7, 7, 576] : tensor<1x7x7x576xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<1x7x7x576xf32> -> tensor<1x7x7x576xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
%conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
ins(%input, %filter : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>)
outs(%fill : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
@@ -803,7 +803,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x128xf32>
%init = linalg.init_tensor [384, 128] : tensor<384x128xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<384x128xf32> -> tensor<384x128xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
%gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
@@ -851,7 +851,7 @@
%rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x128xf32>
%init = linalg.init_tensor [384, 128] : tensor<384x128xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<384x128xf32> -> tensor<384x128xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
%gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
@@ -896,7 +896,7 @@
%input = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [7, 7, 2048], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:7x7x2048xf32> -> tensor<7x7x2048xf32>
%init = linalg.init_tensor [7] : tensor<7xf32>
- %fill = linalg.fill(%cst, %init) : f32, tensor<7xf32> -> tensor<7xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<7xf32>) -> tensor<7xf32>
%reduce = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>],
iterator_types = ["parallel", "reduction", "reduction"]}
@@ -936,7 +936,7 @@
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
-hal.executable private @matmul_i8_i8_i32 {
+hal.executable private @matmul_x86_i8_i8_i32 {
hal.executable.variant public @embedded_elf_x86_64, target = #hal.executable.target<
"llvm",
"embedded-elf-x86_64", {
@@ -944,9 +944,9 @@
native_vector_size = 4 : index,
target_triple = "x86_64-unknown-unknown-eabi-elf"
}> {
- hal.executable.entry_point public @matmul_i8_i8_i32 layout(#executable_layout)
+ hal.executable.entry_point public @matmul_x86_i8_i8_i32 layout(#executable_layout)
builtin.module {
- func @matmul_i8_i8_i32() {
+ func @matmul_x86_i8_i8_i32() {
%c0 = arith.constant 0 : index
%M = hal.interface.constant.load[0] : index
%N = hal.interface.constant.load[1] : index
@@ -974,7 +974,57 @@
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, 32, 0], [0, 0, 16]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.entry_point public @matmul_i8_i8_i32
+// CHECK: hal.executable.entry_point public @matmul_x86_i8_i8_i32
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+hal.executable private @matmul_aarch_i8_i8_i32 {
+ hal.executable.variant public @system_elf_arm_64, target = <"llvm", "system-elf-arm_64", {
+ data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-none-linux-android30"
+ }> {
+ hal.executable.entry_point public @matmul_aarch_i8_i8_i32 layout(#executable_layout)
+ builtin.module {
+ func @matmul_aarch_i8_i8_i32() {
+ %c0 = arith.constant 0 : index
+ %M = hal.interface.constant.load[0] : index
+ %N = hal.interface.constant.load[1] : index
+ %K = hal.interface.constant.load[2] : index
+ %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32)
+ : !flow.dispatch.tensor<readonly:?x?xi8>{%M, %K}
+ %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32)
+ : !flow.dispatch.tensor<readonly:?x?xi8>{%K, %N}
+ %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(32)
+ : !flow.dispatch.tensor<readwrite:?x?xi32>{%M, %N}
+ %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
+ : !flow.dispatch.tensor<readonly:?x?xi8>{%M, %K} -> tensor<?x?xi8>
+ %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
+ : !flow.dispatch.tensor<readonly:?x?xi8>{%K, %N} -> tensor<?x?xi8>
+ %init = flow.dispatch.tensor.load %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+ : !flow.dispatch.tensor<readwrite:?x?xi32>{%M, %N} -> tensor<?x?xi32>
+ %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xi8>, tensor<?x?xi8>) outs(%init : tensor<?x?xi32>) -> tensor<?x?xi32>
+ flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+ : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:?x?xi32>{%M, %N}
+ return
+ }
+ }
+ }
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [16, 4, 64], [4, 4, 4]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUTileFuseAndVectorize>
+// CHECK: hal.executable.entry_point public @matmul_aarch_i8_i8_i32
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -1110,7 +1160,7 @@
%5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:16x49xf32> -> tensor<16x49xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:33x49xf32> -> tensor<33x49xf32>
%7 = linalg.init_tensor [33, 49] : tensor<33x49xf32>
- %8 = linalg.fill(%cst, %7) : f32, tensor<33x49xf32> -> tensor<33x49xf32>
+ %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<33x49xf32>) -> tensor<33x49xf32>
%9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32>
flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor<writeonly:33x49xf32>
return
@@ -1118,7 +1168,7 @@
}
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[3, 7, 0], [3, 7, 0], [0, 0, 16]]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[11, 7, 0], [1, 7, 0], [0, 0, 16]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.entry_point public @matmul_odd
// CHECK-SAME: translation_info = #[[TRANSLATION]]
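Note: most of the churn in the .mlir test diffs here is a mechanical migration of linalg.fill from the old
parenthesized form to the ins/outs form used by the other structured ops. A minimal before/after sketch
(values, shapes, and names are illustrative, not taken from any particular test):

    // Old form: the fill value and the output are positional operands.
    %fill = linalg.fill(%cst, %init) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
    // New form: explicit ins/outs operand groups; attributes follow the op name.
    %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<4x4xf32>) -> tensor<4x4xf32>

On buffers the trailing result type is dropped, e.g. linalg.fill ins(%cst : f32) outs(%buf : memref<4x4xf32>).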
diff --git a/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir b/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir
index a9175c3..44ce48f 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/tile_fuse_and_vectorize.mlir
@@ -26,7 +26,7 @@
%7 = linalg.init_tensor [64, 64] : tensor<64x64xf32>
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<64x512xf32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x64xf32>
- %10 = linalg.fill(%cst, %7) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%7 : tensor<64x64xf32>) -> tensor<64x64xf32>
%11 = linalg.matmul {lowering_config = #config} ins(%8, %9 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%10 : tensor<64x64xf32>) -> tensor<64x64xf32>
%12 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<64x64xf32>) outs(%7 : tensor<64x64xf32>) {
^bb0(%arg2: f32, %arg3: f32): // no predecessors
@@ -109,7 +109,7 @@
%14 = flow.dispatch.tensor.load %2, offsets = [%arg0, 0], sizes = [64, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x384xf32> -> tensor<64x384xf32>
%15 = flow.dispatch.tensor.load %3, offsets = [0, %arg1], sizes = [384, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<384x64xf32>
%16 = linalg.init_tensor [64, 64] : tensor<64x64xf32>
- %17 = linalg.fill(%cst, %16) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+ %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<64x64xf32>) -> tensor<64x64xf32>
%18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[], [32, 32, 32], [16, 16, 16]], native_vector_size = [16, 16, 16]>} ins(%14, %15 : tensor<64x384xf32>, tensor<384x64xf32>) outs(%17 : tensor<64x64xf32>) -> tensor<64x64xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%18, %11, %12 : tensor<64x64xf32>, tensor<64xi32>, tensor<64x64xf32>) outs(%13 : tensor<64x64xf32>) {
^bb0(%arg2: f32, %arg3: i32, %arg4: f32, %arg5: f32): // no predecessors
@@ -178,7 +178,7 @@
%16 = flow.dispatch.tensor.load %4, offsets = [%arg0, 0], sizes = [16, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:784x24xf32> -> tensor<16x24xf32>
%17 = flow.dispatch.tensor.load %5, offsets = [0, %arg1], sizes = [24, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:24x96xf32> -> tensor<24x49xf32>
%18 = linalg.init_tensor [16, 49] : tensor<16x49xf32>
- %19 = linalg.fill(%cst, %18) : f32, tensor<16x49xf32> -> tensor<16x49xf32>
+ %19 = linalg.fill ins(%cst : f32) outs(%18 : tensor<16x49xf32>) -> tensor<16x49xf32>
%20 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[], [16, 16, 32], [16, 16, 16]], native_vector_size = [16, 16, 16]>} ins(%16, %17 : tensor<16x24xf32>, tensor<24x49xf32>) outs(%19 : tensor<16x49xf32>) -> tensor<16x49xf32>
%21 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20, %11, %12, %13, %14 : tensor<16x49xf32>, tensor<49xf32>, tensor<49xf32>, tensor<49xf32>, tensor<49xf32>) outs(%15 : tensor<16x49xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
diff --git a/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
index 8f579b6..9cd86f6 100644
--- a/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
@@ -167,13 +167,8 @@
// FFT doesn't support second level of tiling yet.
return success(!isa<IREE::LinalgExt::FftOp>(op));
}).setMatchByDefault();
- linalg::TilingPatterns<
- linalg::MatmulOp, linalg::FillOp, linalg::BatchMatmulOp,
- linalg::GenericOp, linalg::Conv2DNhwcHwcfOp,
- linalg::DepthwiseConv2DNhwcHwcOp, linalg::DepthwiseConv2DNhwcHwcmOp,
- linalg::PoolingNhwcMaxOp, linalg::PoolingNhwcMinOp,
- linalg::PoolingNhwcSumOp>::insert(patterns, tilingOptions, f);
- patterns.insert<IREE::LinalgExt::TiledOpInterfaceTilingPattern>(
+ patterns.insert<linalg::LinalgTilingPattern,
+ IREE::LinalgExt::TiledOpInterfaceTilingPattern>(
context, tilingOptions, f);
}
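Note: the LLVMGPUTileAndDistribute.cpp change above drops the hard-coded list of tileable ops (matmul, fill,
batch_matmul, generic, plus the NHWC convolution and pooling variants) in favor of the generic
linalg::LinalgTilingPattern, so any Linalg op accepted by the filter is now tiled and distributed to threads;
the filter that excludes FFT ops from this second level of tiling is unchanged. The conv_2d_nchw_fchw case
added to distribute_to_thread.mlir below exercises an op that the old list did not cover.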
diff --git a/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir b/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
index aba5248..7004ce8 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
@@ -47,8 +47,7 @@
: memref<1024x1024xf32> to memref<1024x256xf32, #map4>
%11 = memref.subview %2[%arg0, %arg1] [2, 256] [1, 1]
: memref<1024x1024xf32> to memref<2x256xf32, #map4>
- linalg.fill(%cst, %11) {lowering_config = #config}
- : f32, memref<2x256xf32, #map4>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : memref<2x256xf32, #map4>)
linalg.matmul {lowering_config = #config}
ins(%8, %10 : memref<2x1024xf32, #map4>, memref<1024x256xf32, #map4>)
outs(%11 : memref<2x256xf32, #map4>)
@@ -130,7 +129,7 @@
%7 = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>
%8 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>
%9 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
- linalg.fill(%cst, %9) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} : f32, memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
+ linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} ins(%cst : f32) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} ins(%7, %8 : memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>, memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
}
}
@@ -207,8 +206,7 @@
: memref<1024x1024xf32> to memref<1024x32xf32, #map4>
%11 = memref.subview %2[%arg0, %arg1] [2, 32] [1, 1]
: memref<1024x1024xf32> to memref<2x32xf32, #map4>
- linalg.fill(%cst, %11) {lowering_config = #config}
- : f32, memref<2x32xf32, #map4>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : memref<2x32xf32, #map4>)
linalg.matmul {lowering_config = #config}
ins(%8, %10 : memref<2x1024xf32, #map4>, memref<1024x32xf32, #map4>)
outs(%11 : memref<2x32xf32, #map4>)
@@ -271,7 +269,7 @@
%cst_0 = arith.constant 0xFF800000 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
- linalg.fill(%cst_0, %1) {lowering_config = #config} : f32, memref<f32>
+ linalg.fill {lowering_config = #config} ins(%cst_0 : f32) outs(%1 : memref<f32>)
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) attrs = {lowering_config = #config} {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%2 = arith.cmpf ogt, %arg0, %arg1 : f32
@@ -293,3 +291,70 @@
// CHECK: linalg.generic
// CHECK-SAME: ins(%{{.*}} : memref<1000xf32>) outs(%{{.*}} : memref<f32>)
// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
+#translation = #iree_codegen.translation_info<LLVMGPUVectorize, workload_per_wg = [256, 1, 1]>
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+hal.executable private @conv_dispatch {
+ hal.executable.variant @cuda, target = #executable_target_cuda_nvptx_fb {
+ hal.executable.entry_point @conv_dispatch layout(#executable_layout) {
+ translation_info = #translation,
+ workgroup_size = [64 : index, 1 : index, 1 : index]
+ }
+ builtin.module {
+ func @conv_dispatch() {
+ %c56 = arith.constant 56 : index
+ %c64 = arith.constant 64 : index
+ %c802816 = arith.constant 802816 : index
+ %c41664 = arith.constant 41664 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<1x64x56x56xf32>
+ memref.assume_alignment %0, 64 : memref<1x64x56x56xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c41664) alignment(64) : memref<64x64x1x1xf32>
+ memref.assume_alignment %1, 64 : memref<64x64x1x1xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c802816) alignment(64) : memref<1x64x56x56xf32>
+ memref.assume_alignment %2, 64 : memref<1x64x56x56xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c64 step %workgroup_count_z {
+ scf.for %arg1 = %workgroup_id_y to %c56 step %workgroup_count_y {
+ %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x]
+ %4 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_count_x]
+ scf.for %arg2 = %3 to %c56 step %4 {
+ %5 = affine.min affine_map<(d0) -> (256, -d0 + 56)>(%arg2)
+ %6 = memref.subview %0[0, 0, %arg1, %arg2] [1, 64, 1, %5] [1, 1, 1, 1] : memref<1x64x56x56xf32> to memref<1x64x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>
+ %7 = memref.subview %1[%arg0, 0, 0, 0] [1, 64, 1, 1] [1, 1, 1, 1] : memref<64x64x1x1xf32> to memref<1x64x1x1xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 64 + s0 + d1 + d2 + d3)>>
+ %8 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, 1, 1, %5] [1, 1, 1, 1] : memref<1x64x56x56xf32> to memref<1x1x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>
+          linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 256, 4, 4, 4]]>} ins(%cst : f32) outs(%8 : memref<1x1x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>)
+ linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 256, 4, 4, 4]]>, strides = dense<1> : vector<2xi64>} ins(%6, %7 : memref<1x64x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>, memref<1x64x1x1xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 64 + s0 + d1 + d2 + d3)>>) outs(%8 : memref<1x1x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>)
+ }
+ }
+ }
+ return
+ }
+ }
+ }
+}
+
+// Check that the convolution is distributed.
+// CHECK-LABEL: func @conv_dispatch
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: linalg.fill
+// CHECK: scf.for
+// CHECK: linalg.conv_2d_nchw_fchw
diff --git a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 62ea000..25370ff 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -61,9 +61,9 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2x3xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2x4xf32>
- linalg.fill(%cst, %2) : f32, memref<2x4xf32>
- linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>)
- return
+ linalg.fill ins(%cst : f32) outs(%2 : memref<2x4xf32>)
+ linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>)
+ return
}
}
}
@@ -98,7 +98,7 @@
%cst_0 = arith.constant 0xFF800000 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
- linalg.fill(%cst_0, %1) : f32, memref<f32>
+ linalg.fill ins(%cst_0 : f32) outs(%1 : memref<f32>)
linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) {
^bb0(%arg0: f32, %arg1: f32): // no predecessors
%2 = arith.cmpf ogt, %arg0, %arg1 : f32
@@ -309,7 +309,7 @@
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:256x1024xf32> -> tensor<256x1024xf32>
%15 = linalg.init_tensor [128, 1024] : tensor<128x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<128x1024xf32> -> tensor<128x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation}
ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%16 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:128x1024xf32>
diff --git a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index 59735b5..58e9859 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -301,7 +301,7 @@
%7 = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>
%8 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>
%9 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
- linalg.fill(%cst, %9) {lowering_config = #config} : f32, memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
// expected-error @+1 {{Received first tile dimension of 2 instead of 0 for LLVMGPUMatmulTensorCore}}
linalg.batch_matmul {lowering_config = #config} ins(%7, %8 : memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>, memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
}
diff --git a/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir b/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
index 2379836..0e2cf86 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
@@ -68,7 +68,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:1024x1024xf32> -> tensor<1024x1024xf32>
%15 = linalg.init_tensor [1024, 1024] : tensor<1024x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x1024xf32> -> tensor<1024x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%17 = linalg.matmul ins(%8, %10 : tensor<1024x1024xf32>, tensor<1024x1024xf32>)
outs(%16 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
@@ -135,7 +135,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:1024x1024xf32> -> tensor<1024x1024xf32>
%15 = linalg.init_tensor [1024, 1024] : tensor<1024x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x1024xf32> -> tensor<1024x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%17 = linalg.generic #matmul_trait
ins(%8, %10 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%16 : tensor<1024x1024xf32>) {
^bb(%a: f32, %b: f32, %c: f32) :
@@ -185,7 +185,7 @@
%13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 2, 2, 1], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x2x2x1xf32> -> tensor<3x2x2x1xf32>
%20 = linalg.init_tensor [1, 2, 3, 1] : tensor<1x2x3x1xf32>
- %21 = linalg.fill(%cst, %20) : f32, tensor<1x2x3x1xf32> -> tensor<1x2x3x1xf32>
+ %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1x2x3x1xf32>) -> tensor<1x2x3x1xf32>
%22 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
ins(%11, %13 : tensor<1x4x4x2xf32>, tensor<3x2x2x1xf32>) outs(%21 : tensor<1x2x3x1xf32>) -> tensor<1x2x3x1xf32>
flow.dispatch.tensor.store %22, %2, offsets = [0, 0, 0, 0], sizes = [1, 2, 3, 1], strides = [1, 1, 1, 1]
@@ -260,7 +260,7 @@
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [14, 14, 96], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:14x14x96xf32> -> tensor<14x14x96xf32>
%8 = linalg.init_tensor [96] : tensor<96xf32>
- %9 = linalg.fill(%cst, %8) : f32, tensor<96xf32> -> tensor<96xf32>
+ %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<96xf32>) -> tensor<96xf32>
%10 = linalg.generic {
indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0)>],
iterator_types = ["parallel", "reduction", "reduction"]}
@@ -352,7 +352,7 @@
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 16384], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x16384xf32> -> tensor<512x16384xf32>
%8 = linalg.init_tensor [16384] : tensor<16384xf32>
- %9 = linalg.fill(%cst, %8) : f32, tensor<16384xf32> -> tensor<16384xf32>
+ %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<16384xf32>) -> tensor<16384xf32>
%10 = linalg.generic {
indexing_maps = [#map3, #map4], iterator_types = ["parallel", "reduction"]}
ins(%5 : tensor<512x16384xf32>) outs(%9 : tensor<16384xf32>) {
@@ -402,7 +402,7 @@
%d = flow.dispatch.tensor.load %di, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1]
: !flow.dispatch.tensor<readonly:2048x512xf32> -> tensor<2048x512xf32>
%init = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
- %f = linalg.fill(%cst, %init) : f32, tensor<2048x512xf32> -> tensor<2048x512xf32>
+ %f = linalg.fill ins(%cst : f32) outs(%init : tensor<2048x512xf32>) -> tensor<2048x512xf32>
%m = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%f : tensor<2048x512xf32>) -> tensor<2048x512xf32>
%init2 = linalg.init_tensor [2048, 512] : tensor<2048x512xf32>
%a = linalg.generic {
@@ -511,7 +511,7 @@
%13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 1024, 64], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<4x1024x64xf32>
%17 = linalg.init_tensor [4, 32, 64] : tensor<4x32x64xf32>
- %18 = linalg.fill(%cst, %17) : f32, tensor<4x32x64xf32> -> tensor<4x32x64xf32>
+ %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<4x32x64xf32>) -> tensor<4x32x64xf32>
%19 = linalg.batch_matmul ins(%11, %13 : tensor<4x32x1024xf32>, tensor<4x1024x64xf32>)
outs(%18 : tensor<4x32x64xf32>) -> tensor<4x32x64xf32>
flow.dispatch.tensor.store %19, %2, offsets = [0, 0, 0], sizes = [4, 32, 64], strides = [1, 1, 1]
diff --git a/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index ad61567..3badaf2 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -67,7 +67,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:1024x1024xf32> -> tensor<1024x1024xf32>
%15 = linalg.init_tensor [1024, 1024] : tensor<1024x1024xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x1024xf32> -> tensor<1024x1024xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%17 = linalg.matmul ins(%8, %10 : tensor<1024x1024xf32>, tensor<1024x1024xf32>)
outs(%16 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
diff --git a/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir b/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
index 3acdb83..eac1c9e 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir
@@ -20,7 +20,7 @@
%10 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%9]
%11 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 32)>(%8)
%12 = memref.subview %7[%10, %11] [32, 32] [1, 1] : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
- linalg.fill(%cst, %12) {__internal_linalg_transform__ = "vectorize"} : f32, memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
+ linalg.fill {__internal_linalg_transform__ = "vectorize"} ins(%cst : f32) outs(%12 : memref<32x32xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>)
scf.for %arg0 = %c0 to %c1024 step %c16 {
%13 = memref.subview %5[0, %arg0] [64, 16] [1, 1] : memref<64x1024xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<64x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%14 = memref.subview %6[%arg0, 0] [16, 64] [1, 1] : memref<1024x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>> to memref<16x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 512 + s0 + d1)>>
diff --git a/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir b/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
index c13e167..5d43eeb 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
@@ -32,7 +32,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x512xf32> -> tensor<3x3x3x512xf32>
%22 = linalg.init_tensor [1, 112, 112, 512] : tensor<1x112x112x512xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x512xf32> -> tensor<1x112x112x512xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%23 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1]
@@ -86,7 +86,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%22 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%23 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
@@ -139,7 +139,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
%22 = linalg.init_tensor [1, 16, 16, 16] : tensor<1x16x16x16xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x16x16x16xf32> -> tensor<1x16x16x16xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%23 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1]
@@ -194,7 +194,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x144xf32>
%23 = linalg.init_tensor [1, 28, 28, 144] : tensor<1x28x28x144xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x28x28x144xf32> -> tensor<1x28x28x144xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%24 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1]
@@ -248,7 +248,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x8xf32> -> tensor<3x3x8xf32>
%23 = linalg.init_tensor [1, 4, 4, 8] : tensor<1x4x4x8xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x4x4x8xf32> -> tensor<1x4x4x8xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x9x9x8xf32>, tensor<3x3x8xf32>) outs(%24 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 4, 8], strides = [1, 1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
index a3a08a9..59d7b3b 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
@@ -32,7 +32,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x2048xf32> -> tensor<512x2048xf32>
%15 = linalg.init_tensor [1024, 2048] : tensor<1024x2048xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x2048xf32> -> tensor<1024x2048xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%16 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1]
@@ -86,7 +86,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1]
: !flow.dispatch.tensor<readonly:96x24xf32> -> tensor<96x24xf32>
%15 = linalg.init_tensor [3136, 24] : tensor<3136x24xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<3136x24xf32> -> tensor<3136x24xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%16 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1]
@@ -140,7 +140,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1]
: !flow.dispatch.tensor<readonly:192x64xf32> -> tensor<192x64xf32>
%15 = linalg.init_tensor [196, 64] : tensor<196x64xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<196x64xf32> -> tensor<196x64xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<196x64xf32>) -> tensor<196x64xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%16 : tensor<196x64xf32>) -> tensor<196x64xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1]
@@ -189,7 +189,7 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
- linalg.fill(%cst, %2) : f32, memref<12544x96xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
return
}
@@ -238,7 +238,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:49x576xf32> -> tensor<49x576xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor<readonly:576x160xf32> -> tensor<576x160xf32>
%15 = linalg.init_tensor [49, 160] : tensor<49x160xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<49x160xf32> -> tensor<49x160xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<49x160xf32>) -> tensor<49x160xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%16 : tensor<49x160xf32>) -> tensor<49x160xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1]
@@ -292,7 +292,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x384xf32> -> tensor<4x32x384xf32>
%21 = linalg.init_tensor [4, 384, 384] : tensor<4x384x384xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x384x384xf32> -> tensor<4x384x384xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%22 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1]
@@ -346,7 +346,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x8xf32> -> tensor<4x32x8xf32>
%21 = linalg.init_tensor [4, 8, 8] : tensor<4x8x8xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x8x8xf32> -> tensor<4x8x8xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x8x32xf32>, tensor<4x32x8xf32>) outs(%22 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 8, 8], strides = [1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
index 74373dc..42ee057 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
@@ -45,7 +45,7 @@
%21 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%24 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %25 = linalg.fill(%cst, %24) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %25 = linalg.fill ins(%cst : f32) outs(%24 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%26 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%19, %21 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%25 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
index 7db31c6..dd29936 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
@@ -117,7 +117,7 @@
%14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 24, 24, 8], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:1x24x24x8xf32> -> tensor<1x24x24x8xf32>
%20 = linalg.init_tensor [1, 2, 2, 8] : tensor<1x2x2x8xf32>
- %21 = linalg.fill(%cst, %20) : f32, tensor<1x2x2x8xf32> -> tensor<1x2x2x8xf32>
+ %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
%22 = linalg.pooling_nhwc_sum {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>}
ins(%14, %2 : tensor<1x24x24x8xf32>, tensor<12x12xf32>)
outs(%21 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
@@ -168,7 +168,7 @@
%13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 76, 1, 1], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:1x76x1x1xf32> -> tensor<1x76x1x1xf32>
%18 = linalg.init_tensor [1, 38, 1, 1] : tensor<1x38x1x1xf32>
- %19 = linalg.fill(%cst, %18) : f32, tensor<1x38x1x1xf32> -> tensor<1x38x1x1xf32>
+ %19 = linalg.fill ins(%cst : f32) outs(%18 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
%20 = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 1]> : vector<2xi64>}
ins(%13, %2 : tensor<1x76x1x1xf32>, tensor<2x1xf32>)
outs(%19 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
@@ -278,7 +278,7 @@
%14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 21, 20, 1], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:1x21x20x1xf32> -> tensor<1x21x20x1xf32>
%18 = linalg.init_tensor [1, 19, 18, 1, 4] : tensor<1x19x18x1x4xf32>
- %19 = linalg.fill(%cst_9, %18) : f32, tensor<1x19x18x1x4xf32> -> tensor<1x19x18x1x4xf32>
+ %19 = linalg.fill ins(%cst_9 : f32) outs(%18 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
%20 = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
ins(%14, %cst : tensor<1x21x20x1xf32>, tensor<3x3x1x4xf32>) outs(%19 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
%21 = linalg.generic {
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
index 1887d47..f8b6fea 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
@@ -33,7 +33,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:1x3x32xf32> -> tensor<1x3x32xf32>
%21 = linalg.init_tensor [1, 3, 32] : tensor<1x3x32xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<1x3x32xf32> -> tensor<1x3x32xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<1x3x3xf32>, tensor<1x3x32xf32>) outs(%22 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1]
@@ -87,7 +87,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1]
: !flow.dispatch.tensor<readonly:32x16xi8> -> tensor<32x16xi8>
%15 = linalg.init_tensor [64, 16] : tensor<64x16xi32>
- %16 = linalg.fill(%c0_i32, %15) : i32, tensor<64x16xi32> -> tensor<64x16xi32>
+ %16 = linalg.fill ins(%c0_i32 : i32) outs(%15 : tensor<64x16xi32>) -> tensor<64x16xi32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<64x32xi8>, tensor<32x16xi8>) outs(%16 : tensor<64x16xi32>) -> tensor<64x16xi32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1]
@@ -145,7 +145,7 @@
%15 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [576, 273], strides = [1, 1]
: !flow.dispatch.tensor<readonly:576x273xf32> -> tensor<576x273xf32>
%16 = linalg.init_tensor [400, 273] : tensor<400x273xf32>
- %17 = linalg.fill(%cst, %16) : f32, tensor<400x273xf32> -> tensor<400x273xf32>
+ %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<400x273xf32>) -> tensor<400x273xf32>
%18 = linalg.matmul ins(%13, %15 : tensor<400x576xf32>, tensor<576x273xf32>) outs(%17 : tensor<400x273xf32>) -> tensor<400x273xf32>
%19 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
@@ -211,7 +211,7 @@
%15 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 546], strides = [1, 1]
: !flow.dispatch.tensor<readonly:512x546xf32> -> tensor<512x546xf32>
%16 = linalg.init_tensor [25, 546] : tensor<25x546xf32>
- %17 = linalg.fill(%cst, %16) : f32, tensor<25x546xf32> -> tensor<25x546xf32>
+ %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<25x546xf32>) -> tensor<25x546xf32>
%18 = linalg.matmul ins(%13, %15 : tensor<25x512xf32>, tensor<512x546xf32>) outs(%17 : tensor<25x546xf32>) -> tensor<25x546xf32>
%19 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
@@ -287,7 +287,7 @@
%17 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:128x1024xf16> -> tensor<128x1024xf16>
%18 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %19 = linalg.fill(%cst, %18) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %19 = linalg.fill ins(%cst : f16) outs(%18 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%20 = linalg.matmul ins(%15, %17 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%19 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%21 = linalg.generic {
indexing_maps = [#map5, #map5, #map5, #map5], iterator_types = ["parallel", "parallel"]}
diff --git a/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir b/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
index 17257ae..75dbf88 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
@@ -32,7 +32,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x512xf32> -> tensor<3x3x3x512xf32>
%22 = linalg.init_tensor [1, 112, 112, 512] : tensor<1x112x112x512xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x512xf32> -> tensor<1x112x112x512xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>)
outs(%23 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
@@ -87,7 +87,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
%22 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%23 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
@@ -140,7 +140,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
%22 = linalg.init_tensor [1, 16, 16, 16] : tensor<1x16x16x16xf32>
- %23 = linalg.fill(%cst, %22) : f32, tensor<1x16x16x16xf32> -> tensor<1x16x16x16xf32>
+ %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
%24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%23 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1]
@@ -194,7 +194,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x144xf32> -> tensor<3x3x144xf32>
%23 = linalg.init_tensor [1, 28, 28, 144] : tensor<1x28x28x144xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x28x28x144xf32> -> tensor<1x28x28x144xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%24 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1]
@@ -249,7 +249,7 @@
%16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:3x3x8xf32> -> tensor<3x3x8xf32>
%23 = linalg.init_tensor [1, 1, 2, 8] : tensor<1x1x2x8xf32>
- %24 = linalg.fill(%cst, %23) : f32, tensor<1x1x2x8xf32> -> tensor<1x1x2x8xf32>
+ %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
%25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %16 : tensor<1x3x5x8xf32>, tensor<3x3x8xf32>) outs(%24 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 2, 8], strides = [1, 1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
index d46e6be..ebddd9f 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
@@ -30,7 +30,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x512xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x2048xf32> -> tensor<512x2048xf32>
%15 = linalg.init_tensor [1024, 2048] : tensor<1024x2048xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<1024x2048xf32> -> tensor<1024x2048xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%16 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1]
@@ -82,7 +82,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x96xf32> -> tensor<3136x96xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:96x24xf32> -> tensor<96x24xf32>
%15 = linalg.init_tensor [3136, 24] : tensor<3136x24xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<3136x24xf32> -> tensor<3136x24xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<3136x96xf32>, tensor<96x24xf32>)
outs(%16 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
@@ -137,7 +137,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1]
: !flow.dispatch.tensor<readonly:192x64xf32> -> tensor<192x64xf32>
%15 = linalg.init_tensor [196, 64] : tensor<196x64xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<196x64xf32> -> tensor<196x64xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<196x64xf32>) -> tensor<196x64xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%16 : tensor<196x64xf32>) -> tensor<196x64xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1]
@@ -186,7 +186,7 @@
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
- linalg.fill(%cst, %2) : f32, memref<12544x96xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
return
@@ -238,7 +238,7 @@
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1]
: !flow.dispatch.tensor<readonly:576x160xf32> -> tensor<576x160xf32>
%15 = linalg.init_tensor [49, 160] : tensor<49x160xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<49x160xf32> -> tensor<49x160xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<49x160xf32>) -> tensor<49x160xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%16 : tensor<49x160xf32>) -> tensor<49x160xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1]
@@ -292,7 +292,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x384xf32> -> tensor<4x32x384xf32>
%21 = linalg.init_tensor [4, 384, 384] : tensor<4x384x384xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x384x384xf32> -> tensor<4x384x384xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x384x32xf32>, tensor<4x32x384xf32>)
outs(%22 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
@@ -348,7 +348,7 @@
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1]
: !flow.dispatch.tensor<readonly:4x32x8xf32> -> tensor<4x32x8xf32>
%21 = linalg.init_tensor [4, 2, 8] : tensor<4x2x8xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<4x2x8xf32> -> tensor<4x2x8xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
%23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
ins(%11, %14 : tensor<4x2x32xf32>, tensor<4x32x8xf32>) outs(%22 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 2, 8], strides = [1, 1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir b/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
index d7b8532..ab22b21 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
@@ -56,7 +56,7 @@
%21 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1]
: !flow.dispatch.tensor<readonly:128x1024xf16> -> tensor<128x1024xf16>
%24 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %25 = linalg.fill(%cst, %24) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %25 = linalg.fill ins(%cst : f16) outs(%24 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%26 = linalg.matmul ins(%19, %21 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%25 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%27 = linalg.generic {
indexing_maps = [#map5, #map5, #map5, #map5], iterator_types = ["parallel", "parallel"]}
@@ -133,7 +133,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:256x8xf16> -> tensor<256x8xf16>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:8x1024xf16> -> tensor<8x1024xf16>
%15 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %16 = linalg.fill(%cst, %15) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%8, %10 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%16 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
diff --git a/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir b/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir
index 008d001..a318d01 100644
--- a/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/create_fast_slow_path.mlir
@@ -38,7 +38,7 @@
tensor.yield %cst : f32
} : tensor<1x?x?x3xf32> to tensor<1x?x?x3xf32>
%20 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xf32> -> tensor<3x3x3x32xf32>
- %21 = linalg.fill(%cst, %9) {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} : f32, tensor<1x1x4x32xf32> -> tensor<1x1x4x32xf32>
+ %21 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} ins(%cst : f32) outs(%9 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%22 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>, strides = dense<2> : tensor<2xi64>} ins(%19, %20 : tensor<1x?x?x3xf32>, tensor<3x3x3x32xf32>) outs(%21 : tensor<1x1x4x32xf32>) -> tensor<1x1x4x32xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22, %8 : tensor<1x1x4x32xf32>, tensor<1x1x4x32xf32>) outs(%9 : tensor<1x1x4x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 4, 32], [0, 1, 2, 4], [0, 0, 0, 0, 1, 1, 4]]>} {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
diff --git a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
index 3dfe104..5c568c2 100644
--- a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
@@ -47,7 +47,7 @@
%19 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:256x128xf16> -> tensor<256x128xf16>
    %21 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:128x1024xf16> -> tensor<128x1024xf16>
%24 = linalg.init_tensor [256, 1024] : tensor<256x1024xf16>
- %25 = linalg.fill(%cst, %24) : f16, tensor<256x1024xf16> -> tensor<256x1024xf16>
+ %25 = linalg.fill ins(%cst : f16) outs(%24 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%26 = linalg.matmul ins(%19, %21 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%25 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
%27 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
diff --git a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
index aa3e66f..cf8ceae 100644
--- a/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
@@ -27,7 +27,7 @@
%8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:4096x4096xf32> -> tensor<4096x4096xf32>
%10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:4096x4096xf32> -> tensor<4096x4096xf32>
%15 = linalg.init_tensor [4096, 4096] : tensor<4096x4096xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<4096x4096xf32> -> tensor<4096x4096xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
%17 = linalg.matmul ins(%8, %10 : tensor<4096x4096xf32>, tensor<4096x4096xf32>) outs(%16 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : tensor<4096x4096xf32> -> !flow.dispatch.tensor<writeonly:4096x4096xf32>
return
@@ -78,7 +78,7 @@
%15 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<1024x512xf32>
%17 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x256xf32> -> tensor<512x256xf32>
%20 = linalg.init_tensor [1024, 256] : tensor<1024x256xf32>
- %21 = linalg.fill(%cst, %20) : f32, tensor<1024x256xf32> -> tensor<1024x256xf32>
+ %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1024x256xf32>) -> tensor<1024x256xf32>
%22 = linalg.matmul ins(%15, %17 : tensor<1024x512xf32>, tensor<512x256xf32>) outs(%21 : tensor<1024x256xf32>) -> tensor<1024x256xf32>
%23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%22, %10 : tensor<1024x256xf32>, tensor<1024x256xf32>) outs(%13 : tensor<1024x256xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir
index 980ae2d..aaabd0c 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir
@@ -51,7 +51,7 @@
%15 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg1)[%workgroup_size_y]
%16 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg2)[%workgroup_size_x]
%17 = linalg.init_tensor [%14, %15, %16] : tensor<?x?x?xf32>
- %18 = linalg.fill(%cst, %17) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
%19 = linalg.batch_matmul {lowering_config = #config} ins(%11, %13 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%18 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
flow.dispatch.tensor.store %19, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%9, %10, %12], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x1024x1024xf32>
}
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir
index e4ba66d..631ce5e 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir
@@ -55,7 +55,7 @@
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 16, s0)>(%arg2)[%workgroup_size_x]
%21 = linalg.init_tensor [1, %18, %19, %20] : tensor<1x?x?x?xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%23 = linalg.conv_2d_nhwc_hwcf {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%13, %15 : tensor<1x?x?x8xf32>, tensor<3x3x8x?xf32>)
outs(%22 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
@@ -146,7 +146,7 @@
%19 = affine.min affine_map<(d0)[s0] -> (-d0 + 56, s0)>(%arg1)[%workgroup_size_y]
%20 = affine.min affine_map<(d0)[s0] -> (-d0 + 96, s0)>(%arg2)[%workgroup_size_x]
%21 = linalg.init_tensor [1, %18, %19, %20] : tensor<1x?x?x?xf32>
- %22 = linalg.fill(%cst, %21) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%23 = linalg.depthwise_conv_2d_nhwc_hwc {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%14, %15 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>)
outs(%22 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
@@ -257,7 +257,7 @@
%36 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%37 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%38 = linalg.init_tensor [1, %35, %36, %37] : tensor<1x?x?x?xf32>
- %39 = linalg.fill(%cst, %38) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %39 = linalg.fill ins(%cst : f32) outs(%38 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%40 = linalg.conv_2d_nhwc_hwcf {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%32, %34 : tensor<1x?x?x3xf32>, tensor<3x3x3x?xf32>) outs(%39 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%41 = linalg.generic {lowering_config = #config, indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%40, %16 : tensor<1x?x?x?xf32>, tensor<1x?x?x?xf32>) outs(%20 : tensor<1x?x?x?xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
@@ -386,7 +386,7 @@
%44 = affine.min affine_map<(d0)[s0] -> (-d0 + 112, s0)>(%arg1)[%workgroup_size_y]
%45 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
%46 = linalg.init_tensor [1, %43, %44, %45] : tensor<1x?x?x?xf32>
- %47 = linalg.fill(%cst, %46) : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32>
+ %47 = linalg.fill ins(%cst : f32) outs(%46 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%48 = linalg.depthwise_conv_2d_nhwc_hwc {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%40, %42 : tensor<1x?x?x?xf32>, tensor<3x3x?xf32>) outs(%47 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
%49 = linalg.generic {lowering_config = #config, indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%12, %48 : tensor<?xf32>, tensor<1x?x?x?xf32>) outs(%18 : tensor<1x?x?x?xf32>) {
^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir
index bc18016..816824b 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir
@@ -42,7 +42,7 @@
%11 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg1)[%workgroup_size_x]
%13 = linalg.init_tensor [%11, %12] : tensor<?x?xf16>
- %14 = linalg.fill(%cst, %13) : f16, tensor<?x?xf16> -> tensor<?x?xf16>
+ %14 = linalg.fill ins(%cst : f16) outs(%13 : tensor<?x?xf16>) -> tensor<?x?xf16>
%15 = linalg.matmul {lowering_config = #config} ins(%8, %10 : tensor<?x4096xf16>, tensor<4096x?xf16>) outs(%14 : tensor<?x?xf16>) -> tensor<?x?xf16>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf16> -> !flow.dispatch.tensor<writeonly:4096x4096xf16>
}
@@ -106,7 +106,7 @@
%11 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (-d0 + 4096, s0)>(%arg1)[%workgroup_size_x]
%13 = linalg.init_tensor [%11, %12] : tensor<?x?xf32>
- %14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
%15 = linalg.matmul {lowering_config = #config} ins(%8, %10 : tensor<?x4096xf32>, tensor<4096x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:4096x4096xf32>
}
diff --git a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
index 6b8bb0a..e770765 100644
--- a/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir
@@ -66,7 +66,7 @@
%11 = memref.subview %2[%arg0, 0] [16, 128] [1, 1] : memref<256x128xf16> to memref<16x128xf16, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
%12 = memref.subview %3[0, %arg1] [128, 16] [1, 1] : memref<128x1024xf16> to memref<128x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
%13 = memref.subview %4[%arg0, %arg1] [16, 16] [1, 1] : memref<256x1024xf16> to memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- linalg.fill(%cst, %13) {lowering_config = #config} : f16, memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
+ linalg.fill {lowering_config = #config} ins(%cst : f16) outs(%13 : memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
linalg.matmul {lowering_config = #config}
ins(%11, %12 : memref<16x128xf16, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
outs(%13 : memref<16x16xf16, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
diff --git a/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
index d1433c6..6c7ed93 100644
--- a/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
@@ -28,7 +28,7 @@
%10 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %9) -> (tensor<2x128xf32>) {
%11 = scf.for %arg4 = %c0 to %c128 step %c4 iter_args(%arg5 = %arg3) -> (tensor<2x128xf32>) {
%12 = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<2x128xf32> to tensor<1x4xf32>
- %13 = linalg.fill(%cst, %12) {lowering_config = #config} : f32, tensor<1x4xf32> -> tensor<1x4xf32>
+ %13 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%12 : tensor<1x4xf32>) -> tensor<1x4xf32>
%14 = tensor.extract_slice %7[%arg2, 0] [1, 4] [1, 1] : tensor<2x4xf32> to tensor<1x4xf32>
%15 = tensor.extract_slice %8[0, %arg4] [4, 4] [1, 1] : tensor<4x128xf32> to tensor<4x4xf32>
%16 = linalg.matmul {lowering_config = #config} ins(%14, %15 : tensor<1x4xf32>, tensor<4x4xf32>) outs(%13 : tensor<1x4xf32>) -> tensor<1x4xf32>
diff --git a/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir b/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir
index f2cfcdd..beb6930 100644
--- a/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir
+++ b/iree/compiler/Codegen/Sandbox/test/fusion_expert.mlir
@@ -7,7 +7,7 @@
%d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
%init = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32>
- %0 = linalg.fill(%cst, %init) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %0 = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
%1 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[10, 20, 30]]>}
ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -40,7 +40,7 @@
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%init = linalg.init_tensor [20, 120] : tensor<20x120xf32>
- %0 = linalg.fill(%cst, %init) : f32, tensor<20x120xf32> -> tensor<20x120xf32>
+ %0 = linalg.fill ins(%cst : f32) outs(%init : tensor<20x120xf32>) -> tensor<20x120xf32>
%1 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[10, 20, 30]]>}
ins(%arg0, %arg1 : tensor<20x60xf32>, tensor<60x120xf32>)
outs(%0 : tensor<20x120xf32>) -> tensor<20x120xf32>
diff --git a/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir b/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
index f6b671d..f774ec1 100644
--- a/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
+++ b/iree/compiler/Codegen/Sandbox/test/outline_one_parent_loop.mlir
@@ -34,7 +34,7 @@
%13 = affine.min affine_map<(d0, d1) -> (8, -d0 + d1)>(%arg7, %5)
%14 = scf.for %arg9 = %c0 to %c16 step %c8 iter_args(%arg10 = %arg8) -> (tensor<?x16xf32>) {
%15 = tensor.extract_slice %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x16xf32> to tensor<?x8xf32>
- %16 = linalg.fill(%cst, %15) : f32, tensor<?x8xf32> -> tensor<?x8xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<?x8xf32>) -> tensor<?x8xf32>
%17 = tensor.insert_slice %16 into %arg10[%arg7, %arg9] [%13, 8] [1, 1] : tensor<?x8xf32> into tensor<?x16xf32>
scf.yield %17 : tensor<?x16xf32>
}
diff --git a/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir b/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
index 17a7944..6f8e598 100644
--- a/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
+++ b/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
@@ -113,8 +113,8 @@
%0 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%2 = linalg.init_tensor [] : tensor<i32>
- %3 = linalg.fill(%c-2147483648_i32, %2) : i32, tensor<i32> -> tensor<i32>
- %4 = linalg.fill(%c0_i32, %2) : i32, tensor<i32> -> tensor<i32>
+ %3 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
+ %4 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %3, %arg2, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.return
@@ -137,14 +137,14 @@
// Used as a result; should remain after canonicalization.
%c-2147483648_i32 = arith.constant -2147483648 : i32
%ret0_init = linalg.init_tensor [] : tensor<i32>
- %ret0_value = linalg.fill(%c-2147483648_i32, %ret0_init) : i32, tensor<i32> -> tensor<i32>
+ %ret0_value = linalg.fill ins(%c-2147483648_i32 : i32) outs(%ret0_init : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %ret0_value, %ret0, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
// Unused as a result; should be stripped entirely.
%c0_i32 = arith.constant 0 : i32
%ret1_shaped = flow.dispatch.tie_shape %ret1 : !flow.dispatch.tensor<writeonly:?xi32>{%dim}
%ret1_init = linalg.init_tensor [%dim] : tensor<?xi32>
- %ret1_value = linalg.fill(%c0_i32, %ret1_init) : i32, tensor<?xi32> -> tensor<?xi32>
+ %ret1_value = linalg.fill ins(%c0_i32 : i32) outs(%ret1_init : tensor<?xi32>) -> tensor<?xi32>
flow.dispatch.tensor.store %ret1_value, %ret1_shaped, offsets = [0], sizes = [%dim], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?xi32>{%dim}
flow.return
}
@@ -167,8 +167,8 @@
%0 = flow.dispatch.tensor.load %arg0, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%1 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%2 = linalg.init_tensor [] : tensor<i32>
- %3 = linalg.fill(%c-2147483648_i32, %2) : i32, tensor<i32> -> tensor<i32>
- %4 = linalg.fill(%c0_i32, %2) : i32, tensor<i32> -> tensor<i32>
+ %3 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
+ %4 = linalg.fill ins(%c0_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %3, %arg2, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<readwrite:i32>
flow.return
@@ -190,8 +190,8 @@
%val = tensor.extract %0[] : tensor<i32>
%1 = flow.dispatch.tensor.load %arg1, offsets=[0], sizes=[9], strides = [1] : !flow.dispatch.tensor<readonly:9xi32> -> tensor<9xi32>
%2 = linalg.init_tensor [] : tensor<i32>
- %3 = linalg.fill(%c-2147483648_i32, %2) : i32, tensor<i32> -> tensor<i32>
- %4 = linalg.fill(%val, %2) : i32, tensor<i32> -> tensor<i32>
+ %3 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
+ %4 = linalg.fill ins(%val : i32) outs(%2 : tensor<i32>) -> tensor<i32>
flow.dispatch.tensor.store %3, %arg2, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<readwrite:i32>
flow.return
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir b/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir
index 3d8ccde..4dda3a4 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/convert_linalg_tensor_ops_after.mlir
@@ -9,7 +9,7 @@
%3 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%1)[%arg2, %arg4]
%4 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%2)[%arg3, %arg5]
%5 = linalg.init_tensor [%3, %4] : tensor<?x?xf32>
- %6 = linalg.fill(%0, %5) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %6 = linalg.fill ins(%0 : f32) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
%7 = flow.tensor.update %arg0, %6[%arg2, %arg3] : tensor<?x?xf32>{%1, %2} -> %6 as tensor<?x?xf32>{%3, %4}
return %7 : tensor<?x?xf32>
}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
index 60ff4c6..0755c49 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
@@ -90,7 +90,7 @@
%M = tensor.dim %A, %c0 : tensor<?x?xf32>
%N = tensor.dim %B, %c1 : tensor<?x?xf32>
%0 = linalg.init_tensor [%M, %N] : tensor<?x?xf32>
- %1 = linalg.fill(%zero, %0) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %1 = linalg.fill ins(%zero : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
%2 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
return %2 : tensor<?x?xf32>
@@ -115,7 +115,9 @@
// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[ARG0_CAPTURE]], {{.*}} : !flow.dispatch.tensor<readonly:?x?xf32>{%[[ARG0_DIM0_CAPTURE]], %[[ARG0_DIM1_CAPTURE]]}
// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[ARG1_CAPTURE]], {{.*}} : !flow.dispatch.tensor<readonly:?x?xf32>{%[[ARG1_DIM0_CAPTURE]], %[[ARG1_DIM1_CAPTURE]]}
// CHECK-DAG: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[ZERO]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]] : tensor<?x?xf32>, tensor<?x?xf32>)
// CHECK-SAME: outs(%[[FILL]] : tensor<?x?xf32>)
@@ -133,7 +135,7 @@
%N = tensor.dim %B, %c1 : tensor<?x?xf32>
%K = tensor.dim %A, %c1 : tensor<?x?xf32>
%0 = linalg.init_tensor [%M, %N] : tensor<?x?xf32>
- %1 = linalg.fill(%zero, %0) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %1 = linalg.fill ins(%zero : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
%2 = linalg.init_tensor [%M, %K] : tensor<?x?xf32>
%3 = linalg.generic
{indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
@@ -172,7 +174,9 @@
// CHECK: flow.dispatch.workgroups[%[[N]], %[[M]], %[[C1]]]
// CHECK: %[[ZERO:.+]] = arith.constant 0.0
// CHECK: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[ZERO]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[ZERO]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: linalg.matmul
// CHECK: outs(%[[FILL]] : tensor<?x?xf32>)
@@ -230,13 +234,13 @@
%m = tensor.dim %0, %c0 : tensor<?x4xf32>
%n1 = tensor.dim %rhs1, %c1 : tensor<4x?xf32>
%init1 = linalg.init_tensor [%m, %n1] : tensor<?x?xf32>
- %fill1 = linalg.fill(%cst, %init1) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill1 = linalg.fill ins(%cst : f32) outs(%init1 : tensor<?x?xf32>) -> tensor<?x?xf32>
%1 = linalg.matmul
ins(%0, %rhs1 : tensor<?x4xf32>, tensor<4x?xf32>)
outs(%fill1 : tensor<?x?xf32>) -> tensor<?x?xf32>
%n2 = tensor.dim %rhs2, %c1 : tensor<4x?xf32>
%init2 = linalg.init_tensor [%m, %n2] : tensor<?x?xf32>
- %fill2 = linalg.fill(%cst, %init2) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %fill2 = linalg.fill ins(%cst : f32) outs(%init2 : tensor<?x?xf32>) -> tensor<?x?xf32>
%2= linalg.matmul
ins(%0, %rhs2 : tensor<?x4xf32>, tensor<4x?xf32>)
outs(%fill2 : tensor<?x?xf32>) -> tensor<?x?xf32>
@@ -275,7 +279,7 @@
%3 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%1)[%arg2, %arg4]
%4 = affine.apply affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>(%2)[%arg3, %arg5]
%5 = linalg.init_tensor [%3, %4] : tensor<?x?xf32>
- %6 = linalg.fill(%0, %5) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %6 = linalg.fill ins(%0 : f32) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
%7 = flow.tensor.update %arg0, %6[%arg2, %arg3] : tensor<?x?xf32>{%1, %2} -> %6 as tensor<?x?xf32>{%3, %4}
return %7 : tensor<?x?xf32>
}
@@ -365,7 +369,7 @@
func @conv2d(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>) -> tensor<1x112x112x32xf32> {
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%cst = arith.constant 0.000000e+00 : f32
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -387,7 +391,7 @@
func @depthwise_conv2d(%input: tensor<1x113x113x96xf32>, %filter: tensor<3x3x96xf32>) -> tensor<1x56x56x96xf32> {
%cst = arith.constant 0.000000e+00 : f32
%1 = linalg.init_tensor [1, 56, 56, 96] : tensor<1x56x56x96xf32>
- %2 = linalg.fill(%cst, %1) : f32, tensor<1x56x56x96xf32> -> tensor<1x56x56x96xf32>
+ %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
%4 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%input, %filter : tensor<1x113x113x96xf32>, tensor<3x3x96xf32>) outs(%2 : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
return %4 : tensor<1x56x56x96xf32>
}
@@ -448,7 +452,7 @@
func @fuse_non_tiled_reduction_fill(%input1: tensor<1000xf32>, %input2: tensor<1000xf32>, %offset: tensor<f32>) -> tensor<f32> {
%zero = arith.constant 0.0 : f32
%init = linalg.init_tensor [] : tensor<f32>
- %fill = linalg.fill(%zero, %init) : f32, tensor<f32> -> tensor<f32>
+ %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<f32>) -> tensor<f32>
%reduce = linalg.generic {
indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>, affine_map<(d0) -> ()>],
iterator_types = ["reduction"]}
@@ -701,8 +705,8 @@
%c0 = arith.constant 0 : index
%0 = tensor.dim %arg0, %c0 : tensor<?x?xi32>
%1 = linalg.init_tensor [%0] : tensor<?xi32>
- %2 = linalg.fill(%cmin, %1) : i32, tensor<?xi32> -> tensor<?xi32>
- %3 = linalg.fill(%c0_i32, %1) : i32, tensor<?xi32> -> tensor<?xi32>
+ %2 = linalg.fill ins(%cmin : i32) outs(%1 : tensor<?xi32>) -> tensor<?xi32>
+ %3 = linalg.fill ins(%c0_i32 : i32) outs(%1 : tensor<?xi32>) -> tensor<?xi32>
%4:2 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>,
affine_map<(d0, d1) -> (d1, d0)>,
@@ -788,7 +792,7 @@
%2 = tensor.dim %0, %c0 : tensor<?x?xf32>
%3 = tensor.dim %1, %c1 : tensor<?x?xf32>
%4 = linalg.init_tensor [%2, %3] : tensor<?x?xf32>
- %5 = linalg.fill(%cst, %4) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
%6 = linalg.matmul ins(%0, %1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
%7 = tensor.dim %6, %c0 : tensor<?x?xf32>
%8 = tensor.dim %6, %c1 : tensor<?x?xf32>
@@ -927,7 +931,7 @@
func @pooling_nwhc_sum_static(%input: tensor<1x33x33x160xf32>) -> tensor<1x3x3x160xf32> {
%cst = arith.constant 0.0 : f32
%1 = linalg.init_tensor [1, 3, 3, 160] : tensor<1x3x3x160xf32>
- %2 = linalg.fill(%cst, %1) : f32, tensor<1x3x3x160xf32> -> tensor<1x3x3x160xf32>
+ %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x3x3x160xf32>) -> tensor<1x3x3x160xf32>
%3 = linalg.init_tensor [11, 11] : tensor<11x11xf32>
%4 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<11> : vector<2xi64>} ins(%input, %3 : tensor<1x33x33x160xf32>, tensor<11x11xf32>) outs(%2 : tensor<1x3x3x160xf32>) -> tensor<1x3x3x160xf32>
return %4 : tensor<1x3x3x160xf32>
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir
index 6da28d8..d24826e 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion.mlir
@@ -3,7 +3,7 @@
func @fuse_conv2d_elementwise(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -33,7 +33,8 @@
// CHECK: flow.dispatch.workgroups
// CHECK: %[[INIT:.+]] = linalg.init_tensor
-// CHECK: %[[FILL:.+]] = linalg.fill(%{{.+}}, %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[CONV:.+]] = linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[FILL]] :
// CHECK: linalg.generic
@@ -46,7 +47,7 @@
-> (tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
@@ -80,7 +81,7 @@
func @dont_fuse_conv2d_with_non_identity_map(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
- %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir b/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir
index ac21741..ef440ec 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/infer_numeric_narrowing.mlir
@@ -18,7 +18,7 @@
[[3.900000e+01], [0.000000e+00], [1.270000e+02]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -30,7 +30,7 @@
[[-3.900000e+01], [0.000000e+00], [1.270000e+02]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -43,7 +43,7 @@
[[0.000000e+00], [0.000000e+00], [-1.000000e+00]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -56,7 +56,7 @@
[[1.000000e+00], [1.000000e+00], [2.000000e+00]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
@@ -69,7 +69,7 @@
[[-1.000000e+00], [-1.000000e+00], [-2.000000e+00]]> : tensor<3x1xf32>
%init_value = arith.constant 0.000000e+00 : f32
%0 = linalg.init_tensor [5, 1] : tensor<5x1xf32>
- %1 = linalg.fill(%init_value, %0) : f32, tensor<5x1xf32> -> tensor<5x1xf32>
+ %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
%2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
return %2 : tensor<5x1xf32>
}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir b/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
index 39a5eb8..fa5b8ea 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/matmul_to_mmt4d.mlir
@@ -71,7 +71,7 @@
func @check_mmt4d_with_init_tensor_and_fill(%arg0: tensor<24x8xf32>, %arg1: tensor<8x32xf32>) -> tensor<24x32xf32> {
%c0 = arith.constant 0.0 : f32
%0 = linalg.init_tensor [24, 32] : tensor<24x32xf32>
- %1 = linalg.fill(%c0, %0) : f32, tensor<24x32xf32> -> tensor<24x32xf32>
+ %1 = linalg.fill ins(%c0 : f32) outs(%0 : tensor<24x32xf32>) -> tensor<24x32xf32>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<24x8xf32>, tensor<8x32xf32>) outs(%1 : tensor<24x32xf32>) -> tensor<24x32xf32>
return %2 : tensor<24x32xf32>
}
@@ -85,7 +85,8 @@
// CHECK: %[[RHS4D:.+]] = tensor.expand_shape %[[RHS]]
// CHECK-SAME: tensor<8x32xf32> into tensor<4x2x8x4xf32>
// CHECK: %[[DST_INIT:.+]] = linalg.init_tensor [3, 8, 8, 4] : tensor<3x8x8x4xf32>
-// CHECK: [[DST:.+]] linalg.fill(%[[ZERO:.+]], %[[DST_INIT]])
+// CHECK: [[DST:.+]] linalg.fill
+// CHECK-SAME: outs(%[[DST_INIT]] :
// -----
func @check_mmt4d_i8_static_pad(%arg0: tensor<3x5xi8>, %arg1: tensor<5x2xi8>, %arg2: tensor<3x2xi32>) -> tensor<3x2xi32> {
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir b/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir
index a39ebe0..ae14cd0 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/optimize_numerics.mlir
@@ -60,9 +60,9 @@
func @cast_fill(%arg0 : f32, %arg1 : tensor<3xf32>) -> tensor<3xi8> {
// CHECK: %[[SCALAR:.*]] = arith.fptosi %arg0 : f32 to i8
// CHECK: %[[INIT:.*]] = arith.fptosi %arg1 : tensor<3xf32> to tensor<3xi8>
- // CHECK: %[[RESULT:.*]] = linalg.fill(%[[SCALAR]], %[[INIT]]) : i8, tensor<3xi8> -> tensor<3xi8>
+ // CHECK: %[[RESULT:.*]] = linalg.fill ins(%[[SCALAR]] : i8) outs(%[[INIT]] : tensor<3xi8>) -> tensor<3xi8>
// CHECK: return %[[RESULT]]
- %0 = linalg.fill(%arg0, %arg1) : f32, tensor<3xf32> -> tensor<3xf32>
+ %0 = linalg.fill ins(%arg0 : f32) outs(%arg1 : tensor<3xf32>) -> tensor<3xf32>
%1 = arith.fptosi %0 : tensor<3xf32> to tensor<3xi8>
return %1 : tensor<3xi8>
}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir b/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir
index 316023c..b3cf275 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/pad_tensor_to_tensor.mlir
@@ -28,7 +28,9 @@
// CHECK-DAG: %[[RD0:.+]] = affine.apply #[[MAP0]]()[%[[ARG3]], %[[D0]]]
// CHECK-DAG: %[[RD1:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]], %[[D1]]]
// CHECK: %[[INIT:.+]] = linalg.init_tensor [%[[RD0]], %[[RD1]]]
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[VAL]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[VAL]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[FILL]][4, %[[ARG2]]] [%[[D0]], %[[D1]]] [1, 1]
// CHECK: return %[[RESULT]]
@@ -53,6 +55,8 @@
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<f32>
// CHECK-DAG: %[[VAL:.+]] = tensor.extract %[[ARG1]]
// CHECK: %[[INIT:.+]] = linalg.init_tensor [18, 12]
-// CHECK: %[[FILL:.+]] = linalg.fill(%[[VAL]], %[[INIT]])
+// CHECK: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[VAL]] :
+// CHECK-SAME: outs(%[[INIT]] :
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[FILL]][4, 5] [12, 4] [1, 1]
// CHECK: return %[[RESULT]]
diff --git a/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir b/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
index 6f417c8..a147963 100644
--- a/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
+++ b/iree/compiler/Dialect/Stream/Builtins/fill_i64.mlir
@@ -20,7 +20,7 @@
scf.for %i = %1 to %count step %2 {
%3 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%i)[%count, %workgroup_size_0]
%4 = linalg.init_tensor [%3] : tensor<?xi64>
- %5 = linalg.fill(%value, %4) : i64, tensor<?xi64> -> tensor<?xi64>
+ %5 = linalg.fill ins(%value : i64) outs(%4 : tensor<?xi64>) -> tensor<?xi64>
flow.dispatch.tensor.store %5, %out, offsets = [%i], sizes = [%3], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:?xi64>{%count}
}
return
diff --git a/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir b/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
index 4daad1e..a701eae 100644
--- a/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
+++ b/iree/compiler/Dialect/Stream/Builtins/splat_i64.mlir
@@ -21,7 +21,7 @@
scf.for %i = %1 to %count step %2 {
%3 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%i)[%count, %workgroup_size_0]
%4 = linalg.init_tensor [%3] : tensor<?xi64>
- %5 = linalg.fill(%value, %4) : i64, tensor<?xi64> -> tensor<?xi64>
+ %5 = linalg.fill ins(%value : i64) outs(%4 : tensor<?xi64>) -> tensor<?xi64>
flow.dispatch.tensor.store %5, %out, offsets = [%i], sizes = [%3], strides = [1] : tensor<?xi64> -> !flow.dispatch.tensor<writeonly:?xi64>{%count}
}
return
diff --git a/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir b/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir
index 1ef6e39..67a0c07 100644
--- a/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir
+++ b/iree/compiler/InputConversion/Common/test/linalg_quantized_matmul_to_matmul.mlir
@@ -30,7 +30,9 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<?x?xi8>, tensor<?x?xi8>) outs(%[[ACC]] : tensor<?x?xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_LHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_LHS_SUMS_ACC]])
+// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_LHS_SUMS_ACC]] :
// CHECK: %[[LHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "parallel", "reduction"
// CHECK-SAME: ins(%[[LHS]] : tensor<?x?xi8>)
@@ -55,7 +57,9 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<?x?xi8>, tensor<?x?xi8>) outs(%[[ACC]] : tensor<?x?xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_RHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_RHS_SUMS_ACC]])
+// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_RHS_SUMS_ACC]] :
// CHECK: %[[RHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "reduction", "parallel"
// CHECK-SAME: ins(%[[RHS]] : tensor<?x?xi8>)
@@ -80,13 +84,17 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<?x?xi8>, tensor<?x?xi8>) outs(%[[ACC]] : tensor<?x?xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_LHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_LHS_SUMS_ACC]])
+// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_LHS_SUMS_ACC]] :
// CHECK: %[[LHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "parallel", "reduction"
// CHECK-SAME: ins(%[[LHS]] : tensor<?x?xi8>)
// CHECK-SAME: outs(%[[ZERO_LHS_SUMS_ACC]] : tensor<?xi32>)
// CHECK: %[[INIT_RHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_RHS_SUMS_ACC]])
+// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_RHS_SUMS_ACC]] :
// CHECK: %[[RHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "reduction", "parallel"
// CHECK-SAME: ins(%[[RHS]] : tensor<?x?xi8>)
@@ -115,13 +123,17 @@
// CHECK: %[[MATMUL:.+]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<3x4xi8>, tensor<4x5xi8>) outs(%[[ACC]] : tensor<3x5xi32>)
// CHECK-DAG: %[[INIT_RESULT:.+]] = linalg.init_tensor
// CHECK-DAG: %[[INIT_LHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_LHS_SUMS_ACC]])
+// CHECK: %[[ZERO_LHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_LHS_SUMS_ACC]] :
// CHECK: %[[LHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "parallel", "reduction"
// CHECK-SAME: ins(%[[LHS]] : tensor<3x4xi8>)
// CHECK-SAME: outs(%[[ZERO_LHS_SUMS_ACC]] : tensor<3xi32>)
// CHECK: %[[INIT_RHS_SUMS_ACC:.+]] = linalg.init_tensor
-// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill(%[[C0_I32]], %[[INIT_RHS_SUMS_ACC]])
+// CHECK: %[[ZERO_RHS_SUMS_ACC:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[C0_I32]] :
+// CHECK-SAME: outs(%[[INIT_RHS_SUMS_ACC]] :
// CHECK: %[[RHS_SUMS:.+]] = linalg.generic
// CHECK-SAME: "reduction", "parallel"
// CHECK-SAME: ins(%[[RHS]] : tensor<4x5xi8>)
diff --git a/iree/compiler/InputConversion/MHLO/BUILD b/iree/compiler/InputConversion/MHLO/BUILD
index 00b5217..e8093bc 100644
--- a/iree/compiler/InputConversion/MHLO/BUILD
+++ b/iree/compiler/InputConversion/MHLO/BUILD
@@ -93,6 +93,10 @@
"@llvm-project//mlir:Transforms",
"@mlir-hlo//:chlo_legalize_to_hlo",
"@mlir-hlo//:hlo",
+ "@mlir-hlo//:hlo_legalize_shape_ops_to_standard",
+ "@mlir-hlo//:hlo_legalize_to_arithmetic",
+ "@mlir-hlo//:hlo_legalize_to_lhlo",
+ "@mlir-hlo//:hlo_legalize_to_memref",
"@mlir-hlo//:legalize_control_flow",
"@mlir-hlo//:legalize_einsum_to_dot_general",
"@mlir-hlo//:legalize_gather_to_torch_index_select",
diff --git a/iree/compiler/InputConversion/MHLO/CMakeLists.txt b/iree/compiler/InputConversion/MHLO/CMakeLists.txt
index ccd8640..c153e45 100644
--- a/iree/compiler/InputConversion/MHLO/CMakeLists.txt
+++ b/iree/compiler/InputConversion/MHLO/CMakeLists.txt
@@ -83,7 +83,11 @@
MLIRTransforms
MhloDialect
MhloPasses
+ MhloShapeOpsToStandard
+ MhloToArithmeticConversion
+ MhloToLhloConversion
MhloToLinalg
+ MhloToMemrefConversion
MhloToStandard
iree::compiler::Dialect::Flow::IR
iree::compiler::Dialect::Flow::Transforms
diff --git a/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp b/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp
index 99fab42..ebc4504 100644
--- a/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp
+++ b/iree/compiler/InputConversion/MHLO/MHLOToLinalgOnTensors.cpp
@@ -84,7 +84,7 @@
rewriter.createOrFold<arith::AddIOp>(loc, resultDimSize, size);
}
sizes[dim] = resultDimSize;
- auto initTensor = rewriter.create<linalg::InitTensorOp>(
+ Value initTensor = rewriter.create<linalg::InitTensorOp>(
loc, resultType.getShape(), resultType.getElementType());
auto zeroAttr = rewriter.getZeroAttr(resultType.getElementType());
Value zero = rewriter.create<arith::ConstantOp>(loc, zeroAttr);
@@ -146,7 +146,7 @@
Value rhs) {
Value zero = b.create<arith::ConstantOp>(
loc, b.getZeroAttr(resultType.getElementType()));
- auto initTensor = b.create<linalg::InitTensorOp>(
+ Value initTensor = b.create<linalg::InitTensorOp>(
loc, /*dyn_size=*/ValueRange{}, resultType.getShape(),
resultType.getElementType());
Value zeroTensor =
diff --git a/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp b/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
index 571ea7f..ceff92c 100644
--- a/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
+++ b/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
@@ -79,71 +79,6 @@
.getResult();
}
-class ExtractConvOpPaddingAttributes : public OpRewritePattern<mhlo::ConvOp> {
- public:
- using OpRewritePattern<mhlo::ConvOp>::OpRewritePattern;
-
- LogicalResult matchAndRewrite(mhlo::ConvOp op,
- PatternRewriter &rewriter) const override {
- if (!hasPadding(op)) return failure();
- auto inputType = op.lhs().getType().cast<ShapedType>();
- int rank = inputType.getRank();
-
- // TODO(suderman): Add proper support for padding + dilation for codegen.
- // We can't extract padding if the left hand side has dilation.
- if (op.lhs_dilation().hasValue()) {
- for (auto val : op.lhs_dilation().getValue().getValues<APInt>()) {
- if (val != 1) {
- return failure();
- }
- }
- }
-
- SmallVector<int64_t, 4> paddingLow, paddingHigh, interiorPadding, shape;
- paddingLow.append(rank, 0);
- paddingHigh.append(rank, 0);
- interiorPadding.append(rank, 0);
- for (auto iter :
- llvm::enumerate(op.dimension_numbers().getInputSpatialDimensions())) {
- unsigned idx = iter.index();
- unsigned dim = iter.value();
- paddingLow[dim] = op.paddingAttr().getValues<int64_t>()[{idx, 0}];
- paddingHigh[dim] = op.paddingAttr().getValues<int64_t>()[{idx, 1}];
- }
- for (unsigned i = 0; i < rank; ++i) {
- // mhlo.pad doesn't support dynamic shape.
- if (inputType.isDynamicDim(i)) return failure();
- int size = inputType.getShape()[i];
- shape.push_back(size + paddingLow[i] + paddingHigh[i]);
- }
-
- auto toDenseAttr = [&rewriter](ArrayRef<int64_t> elements) {
- return DenseIntElementsAttr::get(
- RankedTensorType::get(elements.size(), rewriter.getIntegerType(64)),
- elements);
- };
-
- auto loc = op.getLoc();
- auto padResultType =
- RankedTensorType::get(shape, inputType.getElementType());
- Attribute zeroAttr = rewriter.getZeroAttr(
- RankedTensorType::get({}, inputType.getElementType()));
- auto zero = rewriter.create<arith::ConstantOp>(loc, zeroAttr);
- auto padOp = rewriter.create<mhlo::PadOp>(
- loc, padResultType, op.lhs(), zero, toDenseAttr(paddingLow),
- toDenseAttr(paddingHigh), toDenseAttr(interiorPadding));
- auto resultType = op.getResult().getType();
- auto newOp = rewriter.create<mhlo::ConvOp>(
- op.getLoc(), resultType, padOp.getResult(), op.rhs(),
- op.window_stridesAttr(), /*padding=*/nullptr, op.lhs_dilationAttr(),
- op.rhs_dilationAttr(), /*window_reversal=*/nullptr,
- op.dimension_numbersAttr(), op.feature_group_countAttr(),
- op.batch_group_countAttr(), op.precision_configAttr());
- rewriter.replaceOp(op, newOp.getResult());
- return success();
- }
-};
-
// Guarantee that the input dimensions are ordered batch, spatial_dims, feature
// dim.
class ReorderConvOpInputDimensions : public OpRewritePattern<mhlo::ConvOp> {
@@ -337,70 +272,6 @@
}
};
-class ExtractReduceWindowOpPaddingAttributes
- : public OpRewritePattern<mhlo::ReduceWindowOp> {
- public:
- using OpRewritePattern<mhlo::ReduceWindowOp>::OpRewritePattern;
-
- LogicalResult matchAndRewrite(mhlo::ReduceWindowOp op,
- PatternRewriter &rewriter) const override {
- if (!op.padding()) return failure();
-
- if ((op.base_dilations() && !isSplatValue(*op.base_dilations(), 1)) ||
- (op.window_dilations() && !isSplatValue(*op.window_dilations(), 1))) {
- return failure();
- }
- if (isAllZero(op.paddingAttr())) return failure();
-
- // All inputs must be of the same static shape, since
- // mhlo.pad doesn't support dynamic shape.
- for (Type inputType : op.inputs().getType()) {
- if (!inputType.cast<ShapedType>().hasStaticShape()) return failure();
- }
- ArrayRef<int64_t> inputShape =
- op.inputs()[0].getType().cast<ShapedType>().getShape();
-
- int rank = inputShape.size();
- SmallVector<int64_t, 4> paddingLow, paddingHigh, interiorPadding, shape;
- for (unsigned i = 0; i < rank; ++i) {
- interiorPadding.push_back(0);
- paddingLow.push_back(op.paddingAttr().getValues<int64_t>()[{i, 0}]);
- paddingHigh.push_back(op.paddingAttr().getValues<int64_t>()[{i, 1}]);
- int size = inputShape[i];
- shape.push_back(size + paddingLow.back() + paddingHigh.back());
- }
-
- auto toDenseAttr = [&rewriter](ArrayRef<int64_t> elements) {
- return DenseIntElementsAttr::get(
- RankedTensorType::get(elements.size(), rewriter.getIntegerType(64)),
- elements);
- };
-
- SmallVector<Value> padOps;
- padOps.reserve(op.inputs().size());
- auto loc = op.getLoc();
- for (auto it : llvm::zip(op.inputs(), op.init_values())) {
- Value input = std::get<0>(it);
- Value initValue = std::get<1>(it);
- auto inputType = input.getType().cast<ShapedType>();
- auto padResultType =
- RankedTensorType::get(shape, inputType.getElementType());
- auto padOp = rewriter.create<mhlo::PadOp>(
- loc, padResultType, input, initValue, toDenseAttr(paddingLow),
- toDenseAttr(paddingHigh), toDenseAttr(interiorPadding));
- padOps.push_back(padOp);
- }
- auto newOp = rewriter.create<mhlo::ReduceWindowOp>(
- loc, op.getResultTypes(), padOps, op.init_values(),
- op.window_dimensions(), op.window_stridesAttr(),
- op.base_dilationsAttr(), op.window_dilationsAttr(),
- /*padding=*/nullptr);
- rewriter.inlineRegionBefore(op.body(), newOp.body(), newOp.body().begin());
- rewriter.replaceOp(op, newOp.getResults());
- return success();
- }
-};
-
// Adjust the shape of the depthwise_conv filter where it is applied by mhlo.
class AdjustDepthwiseFilterShape : public OpRewritePattern<mhlo::ConvOp> {
public:
@@ -936,8 +807,7 @@
mhlo::PopulateUnfuseBatchNormPatterns(context, &patterns);
mhlo::PopulateComplexLoweringPatterns(context, &patterns);
mhlo::PopulateGatherToTorchIndexSelectPatterns(context, &patterns);
- patterns.insert<ExtractReduceWindowOpPaddingAttributes,
- AdjustDepthwiseFilterShape, ScatterRank0Value,
+ patterns.insert<AdjustDepthwiseFilterShape, ScatterRank0Value,
ExpandRngNormal, MulCastOfBool>(context);
// dot_general canonicalization patterns.
@@ -987,9 +857,6 @@
ReorderBroadcastInDimOpAndElementwiseOp<mhlo::AndOp>,
ReorderBroadcastInDimOpAndElementwiseOp<mhlo::OrOp>,
ReorderBroadcastInDimOpAndElementwiseOp<mhlo::XorOp>>(context);
- if (extractPadFromConv) {
- patterns.insert<ExtractConvOpPaddingAttributes>(context);
- }
if (orderConvFeatures) {
patterns.insert<ReorderConvOpInputDimensions>(context);
patterns.insert<ReorderConvOpKernelDimensions>(context);
diff --git a/iree/compiler/InputConversion/MHLO/Passes.td b/iree/compiler/InputConversion/MHLO/Passes.td
index aa633a8..4cf16ae 100644
--- a/iree/compiler/InputConversion/MHLO/Passes.td
+++ b/iree/compiler/InputConversion/MHLO/Passes.td
@@ -40,8 +40,6 @@
let summary = "Apply mhlo to mhlo transformations for some mhlo ops";
let constructor = "mlir::iree_compiler::MHLO::createMHLOToMHLOPreprocessingPass()";
let options = [
- Option<"extractPadFromConv", "extract-pad-from-conv", "bool", /*default=*/"true",
- "Extract padding attributes from conv op">,
Option<"orderConvFeatures", "order-conv-features", "bool", /*default=*/"true",
"Guarantees input/output features ordered from conv kernel">
];
diff --git a/iree/compiler/InputConversion/MHLO/test/BUILD b/iree/compiler/InputConversion/MHLO/test/BUILD
index d0fbcbf..8e14a4d 100644
--- a/iree/compiler/InputConversion/MHLO/test/BUILD
+++ b/iree/compiler/InputConversion/MHLO/test/BUILD
@@ -29,7 +29,6 @@
"legalize_input_types.mlir",
"mhlo_to_mhlo_preprocessing.mlir",
"mhlo_to_mhlo_preprocessing_canoncalize_dot_general.mlir",
- "mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir",
"missing_legalizations.mlir",
"verify_compiler_mhlo_input_legality.mlir",
],
diff --git a/iree/compiler/InputConversion/MHLO/test/CMakeLists.txt b/iree/compiler/InputConversion/MHLO/test/CMakeLists.txt
index f95c73d..96e60e4 100644
--- a/iree/compiler/InputConversion/MHLO/test/CMakeLists.txt
+++ b/iree/compiler/InputConversion/MHLO/test/CMakeLists.txt
@@ -24,7 +24,6 @@
"legalize_input_types.mlir"
"mhlo_to_mhlo_preprocessing.mlir"
"mhlo_to_mhlo_preprocessing_canoncalize_dot_general.mlir"
- "mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir"
"missing_legalizations.mlir"
"verify_compiler_mhlo_input_legality.mlir"
TOOLS
diff --git a/iree/compiler/InputConversion/MHLO/test/fft.mlir b/iree/compiler/InputConversion/MHLO/test/fft.mlir
index b5ec7b6..367269b 100644
--- a/iree/compiler/InputConversion/MHLO/test/fft.mlir
+++ b/iree/compiler/InputConversion/MHLO/test/fft.mlir
@@ -14,12 +14,16 @@
// CHECK-DAG: %[[ImagMatrix:.+]] = arith.constant dense<"0x00000080{{.*}}"> : tensor<32x17xf32>
// CHECK-DAG: %[[Zero:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[RealInit:.+]] = linalg.init_tensor [17] : tensor<17xf32>
-// CHECK: %[[RealFill:.+]] = linalg.fill(%[[Zero]], %[[RealInit]])
+// CHECK: %[[RealFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[RealInit]] :
// CHECK: %[[RealRes:.+]] = linalg.vecmat
// CHECK-SAME: ins(%[[Arg0]], %[[RealMatrix]] : tensor<32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[RealFill]] : tensor<17xf32>) -> tensor<17xf32>
// CHECK: %[[ImagInit:.+]] = linalg.init_tensor [17] : tensor<17xf32>
-// CHECK: %[[ImagFill:.+]] = linalg.fill(%[[Zero]], %[[ImagInit]])
+// CHECK: %[[ImagFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[ImagInit]] :
// CHECK: %[[ImagRes:.+]] = linalg.vecmat
// CHECK-SAME: ins(%[[Arg0]], %[[ImagMatrix]] : tensor<32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[ImagFill]] : tensor<17xf32>) -> tensor<17xf32>
@@ -41,12 +45,16 @@
// CHECK-DAG: %[[ImagMatrix:.+]] = arith.constant dense<"0x00000080{{.*}}"> : tensor<32x17xf32>
// CHECK-DAG: %[[Zero:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[RealInit:.+]] = linalg.init_tensor [1, 17] : tensor<1x17xf32>
-// CHECK: %[[RealFill:.+]] = linalg.fill(%[[Zero]], %[[RealInit]])
+// CHECK: %[[RealFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[RealInit]] :
// CHECK: %[[RealRes:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[Arg0]], %[[RealMatrix]] : tensor<1x32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[RealFill]] : tensor<1x17xf32>) -> tensor<1x17xf32>
// CHECK: %[[ImagInit:.+]] = linalg.init_tensor [1, 17] : tensor<1x17xf32>
-// CHECK: %[[ImagFill:.+]] = linalg.fill(%[[Zero]], %[[ImagInit]])
+// CHECK: %[[ImagFill:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[Zero]] :
+// CHECK-SAME: outs(%[[ImagInit]] :
// CHECK: %[[ImagRes:.+]] = linalg.matmul
// CHECK-SAME: ins(%[[Arg0]], %[[ImagMatrix]] : tensor<1x32xf32>, tensor<32x17xf32>)
// CHECK-SAME: outs(%[[ImagFill]] : tensor<1x17xf32>) -> tensor<1x17xf32>
diff --git a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
index 75a1b3b..e72a219 100644
--- a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
+++ b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
@@ -58,59 +58,6 @@
// -----
-// CHECK-LABEL: @reduce_window
-func @reduce_window(%input: tensor<1x16x16x64xf32>) -> tensor<1x8x8x64xf32> {
- // CHECK: %[[INITVAL:.+]] = mhlo.constant dense<0xFF800000> : tensor<f32>
- %initval = mhlo.constant dense<0xFF800000> : tensor<f32>
- // CHECK: %[[PAD:.+]] = "mhlo.pad"(%{{.+}}, %[[INITVAL]])
- // CHECK-SAME: edge_padding_high = dense<[0, 1, 1, 0]> : tensor<4xi64>
- // CHECK-SAME: edge_padding_low = dense<[0, 1, 1, 0]> : tensor<4xi64>
- // CHECK: "mhlo.reduce_window"(%[[PAD]], %[[INITVAL]])
- // CHECK-NOT: padding
- %0 = "mhlo.reduce_window"(%input, %initval) ( {
- ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>): // no predecessors
- %3 = mhlo.maximum %arg1, %arg2 : tensor<f32>
- "mhlo.return"(%3) : (tensor<f32>) -> ()
- }) {window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>,
- window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>,
- window_dilations = dense<1> : tensor<4xi64>,
- base_dilations = dense<1> : tensor<4xi64>,
- padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>
- } : (tensor<1x16x16x64xf32>, tensor<f32>) -> tensor<1x8x8x64xf32>
- return %0 : tensor<1x8x8x64xf32>
-}
-
-// -----
-
-// CHECK-LABEL: @reduce_window_variadic
-func @reduce_window_variadic(%input0: tensor<1x16x16x64xf32>, %input1: tensor<1x16x16x64xi32>) -> (tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>) {
- // CHECK-DAG: %[[INITVAL0:.+]] = mhlo.constant dense<0xFF800000> : tensor<f32>
- // CHECK-DAG: %[[INITVAL1:.+]] = mhlo.constant dense<3> : tensor<i32>
- %initval0 = mhlo.constant dense<0xFF800000> : tensor<f32>
- %initval1 = mhlo.constant dense<3> : tensor<i32>
-
- // CHECK: %[[PAD0:.+]] = "mhlo.pad"(%{{.+}}, %[[INITVAL0]])
- // CHECK-SAME: edge_padding_high = dense<[0, 1, 1, 0]> : tensor<4xi64>
- // CHECK-SAME: edge_padding_low = dense<[0, 1, 1, 0]> : tensor<4xi64>
- // CHECK: %[[PAD1:.+]] = "mhlo.pad"(%{{.+}}, %[[INITVAL1]])
- // CHECK-SAME: edge_padding_high = dense<[0, 1, 1, 0]> : tensor<4xi64>
- // CHECK-SAME: edge_padding_low = dense<[0, 1, 1, 0]> : tensor<4xi64>
- // CHECK: "mhlo.reduce_window"(%[[PAD0]], %[[PAD1]], %[[INITVAL0]], %[[INITVAL1]])
- // CHECK-NOT: padding
- %0:2 = "mhlo.reduce_window"(%input0, %input1, %initval0, %initval1) ( {
- ^bb0(%arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<f32>, %arg4: tensor<i32>): // no predecessors
- %3 = mhlo.maximum %arg1, %arg3 : tensor<f32>
- %4 = mhlo.add %arg2, %arg4 : tensor<i32>
- "mhlo.return"(%3, %4) : (tensor<f32>, tensor<i32>) -> ()
- }) {window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>,
- window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>,
- padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>
- } : (tensor<1x16x16x64xf32>, tensor<1x16x16x64xi32>, tensor<f32>, tensor<i32>) -> (tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>)
- return %0#0, %0#1 : tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>
-}
-
-// -----
-
// CHECK: @reorder_broadcast_in_dim_scalar_binary(%[[ARG0:.*]]: tensor<f32>, %[[ARG1:.*]]: tensor<f32>, %[[ARG2:.*]]: tensor<i32>, %[[ARG3:.*]]: tensor<i32>)
func @reorder_broadcast_in_dim_scalar_binary(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<i32>) -> (tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xf32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>, tensor<1x8x8x64xi32>) {
// CHECK: %[[ADD:.*]] = mhlo.add %[[ARG0]], %[[ARG1]] : tensor<f32>
diff --git a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir
deleted file mode 100644
index 427c37f..0000000
--- a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: iree-opt -iree-mhlo-to-mhlo-preprocessing %s | FileCheck %s
-
-// CHECK-LABEL: @conv
-// CHECK: mhlo.pad
-// CHECK-SAME: edge_padding_high = dense<[0, 1, 1, 0]>
-// CHECK-SAME: edge_padding_low = dense<[0, 1, 0, 0]>
-// CHECK: mhlo.convolution
-// CHECK-NOT: padding
-func @conv(%inputs: tensor<1x4x5x2xf32>, %weights: tensor<3x2x2x1xf32>) -> tensor<1x4x5x1xf32> {
- %0 = "mhlo.convolution"(%inputs, %weights) {
- batch_group_count = 1 : i64,
- dimension_numbers = #mhlo.conv<raw
- input_batch_dimension = 0,
- input_feature_dimension = 3,
- input_spatial_dimensions = [1, 2],
- kernel_input_feature_dimension = 2,
- kernel_output_feature_dimension = 3,
- kernel_spatial_dimensions = [0, 1],
- output_batch_dimension = 0,
- output_feature_dimension = 3,
- output_spatial_dimensions = [1, 2]
- >,
- feature_group_count = 1 : i64,
- padding = dense<[[1, 1], [0, 1]]> : tensor<2x2xi64>,
- rhs_dilation = dense<1> : tensor<2xi64>,
- window_strides = dense<1> : tensor<2xi64>} :
- (tensor<1x4x5x2xf32>, tensor<3x2x2x1xf32>) -> tensor<1x4x5x1xf32>
- return %0 : tensor<1x4x5x1xf32>
-}
diff --git a/iree/test/e2e/regression/fill_i64.mlir b/iree/test/e2e/regression/fill_i64.mlir
index 3bd8565..1bc2d21 100644
--- a/iree/test/e2e/regression/fill_i64.mlir
+++ b/iree/test/e2e/regression/fill_i64.mlir
@@ -10,12 +10,12 @@
%cv0 = arith.constant -1 : i64
%v0_init = linalg.init_tensor [%0, %1] : tensor<?x?xi64>
- %v0 = linalg.fill(%cv0, %v0_init) : i64, tensor<?x?xi64> -> tensor<?x?xi64>
+ %v0 = linalg.fill ins(%cv0 : i64) outs(%v0_init : tensor<?x?xi64>) -> tensor<?x?xi64>
// CHECK: 2x3xi64=[-1 -1 -1][-1 -1 -1]
%cv1 = arith.constant 9223372036854775807 : i64
%v1_init = linalg.init_tensor [%0, %1] : tensor<?x?xi64>
- %v1 = linalg.fill(%cv1, %v1_init) : i64, tensor<?x?xi64> -> tensor<?x?xi64>
+ %v1 = linalg.fill ins(%cv1 : i64) outs(%v1_init : tensor<?x?xi64>) -> tensor<?x?xi64>
// CHECK: 2x3xi64=[9223372036854775807 9223372036854775807 9223372036854775807][9223372036854775807 9223372036854775807 9223372036854775807]
return %v0, %v1 : tensor<?x?xi64>, tensor<?x?xi64>
diff --git a/iree/test/e2e/regression/linalg_ops.mlir b/iree/test/e2e/regression/linalg_ops.mlir
index 98173af..c9ec6e1 100644
--- a/iree/test/e2e/regression/linalg_ops.mlir
+++ b/iree/test/e2e/regression/linalg_ops.mlir
@@ -38,7 +38,7 @@
%bias = util.unfoldable_constant dense<1.0> : tensor<16xf32>
%init = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
%cst = arith.constant 0.0 : f32
- %fill = linalg.fill(%cst, %init) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
%conv = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>)
diff --git a/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir b/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir
index 3378d41..58dfe83 100644
--- a/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir
+++ b/iree/test/e2e/regression/linalg_quantized_matmul_vs_linalg_matmul.mlir
@@ -21,7 +21,7 @@
// compute the sums along rows of %lhs.
%lhs_i32 = arith.extsi %lhs : tensor<3x4xi8> to tensor<3x4xi32>
%init_lhs_sums_uninitialized = linalg.init_tensor [3] : tensor<3xi32>
- %zero_lhs_sums = linalg.fill(%c_0, %init_lhs_sums_uninitialized) : i32, tensor<3xi32> -> tensor<3xi32>
+ %zero_lhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_lhs_sums_uninitialized : tensor<3xi32>) -> tensor<3xi32>
%lhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
@@ -36,7 +36,7 @@
// compute the sums along columns of %rhs.
%rhs_i32 = arith.extsi %rhs : tensor<4x5xi8> to tensor<4x5xi32>
%init_rhs_sums_uninitialized = linalg.init_tensor [5] : tensor<5xi32>
- %zero_rhs_sums = linalg.fill(%c_0, %init_rhs_sums_uninitialized) : i32, tensor<5xi32> -> tensor<5xi32>
+ %zero_rhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_rhs_sums_uninitialized : tensor<5xi32>) -> tensor<5xi32>
%rhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>],
@@ -93,7 +93,7 @@
// compute the sums along rows of %lhs.
%lhs_i32 = arith.extsi %lhs : tensor<?x?xi8> to tensor<?x?xi32>
%init_lhs_sums_uninitialized = linalg.init_tensor [%m_size] : tensor<?xi32>
- %zero_lhs_sums = linalg.fill(%c_0, %init_lhs_sums_uninitialized) : i32, tensor<?xi32> -> tensor<?xi32>
+ %zero_lhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_lhs_sums_uninitialized : tensor<?xi32>) -> tensor<?xi32>
%lhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
@@ -108,7 +108,7 @@
// compute the sums along columns of %rhs.
%rhs_i32 = arith.extsi %rhs : tensor<?x?xi8> to tensor<?x?xi32>
%init_rhs_sums_uninitialized = linalg.init_tensor [%n_size] : tensor<?xi32>
- %zero_rhs_sums = linalg.fill(%c_0, %init_rhs_sums_uninitialized) : i32, tensor<?xi32> -> tensor<?xi32>
+ %zero_rhs_sums = linalg.fill ins(%c_0 : i32) outs(%init_rhs_sums_uninitialized : tensor<?xi32>) -> tensor<?xi32>
%rhs_sums = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>],
@@ -192,7 +192,7 @@
%c_plus127 = arith.constant 127 : i32
%init_acc_uninitialized = linalg.init_tensor [3, 5] : tensor<3x5xi32>
- %zero_acc = linalg.fill(%c_0, %init_acc_uninitialized) : i32, tensor<3x5xi32> -> tensor<3x5xi32>
+ %zero_acc = linalg.fill ins(%c_0 : i32) outs(%init_acc_uninitialized : tensor<3x5xi32>) -> tensor<3x5xi32>
// Test special case: both zero points are 0
call @check_one_quantized_matmul_as_matmul_3x4x5(%lhs_3x4_1, %rhs_4x5_1, %c_0, %c_0, %zero_acc) : (tensor<3x4xi8>, tensor<4x5xi8>, i32, i32, tensor<3x5xi32>) -> ()
// Test special cases: one of the zero points is 0
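For reference, the row/column sums in this test implement the usual zero-point expansion of a quantized matmul. In the test's own names (%lhs_sums holds the sums along rows of %lhs, %rhs_sums the sums along columns of %rhs) and writing K for the shared dimension (4 in the 3x4x5 case), the identity being checked is, in sketch form:

    quantized_matmul(lhs, rhs, lhs_zp, rhs_zp)[m, n]
        = sum_k (lhs[m, k] - lhs_zp) * (rhs[k, n] - rhs_zp)
        = matmul(lhs, rhs)[m, n]
          - rhs_zp * lhs_sums[m]
          - lhs_zp * rhs_sums[n]
          + K * lhs_zp * rhs_zp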
diff --git a/iree/test/e2e/xla_ops/BUILD b/iree/test/e2e/xla_ops/BUILD
index bc109a5..fe58b31 100644
--- a/iree/test/e2e/xla_ops/BUILD
+++ b/iree/test/e2e/xla_ops/BUILD
@@ -469,9 +469,83 @@
target_backend = "dylib-llvm-aot",
)
+# Check compilation with host CPU features (LLVM-AOT backend, -iree-llvm-target-cpu-features=host).
+iree_check_single_backend_test_suite(
+ name = "check_dylib-llvm-aot-host_dylib",
+ srcs = enforce_glob(
+ # keep sorted
+ [
+ "abs.mlir",
+ "add.mlir",
+ "batch_norm_inference.mlir",
+ "bitcast_convert.mlir",
+ "broadcast.mlir",
+ "broadcast_add.mlir",
+ "broadcast_in_dim.mlir",
+ "clamp.mlir",
+ "compare.mlir",
+ "concatenate.mlir",
+ "constant.mlir",
+ "convert.mlir",
+ "convolution.mlir",
+ "cosine.mlir",
+ "divide.mlir",
+ "dot.mlir",
+ "dot_general.mlir",
+ "dynamic_slice.mlir",
+ "dynamic_update_slice.mlir",
+ "exponential.mlir",
+ "exponential_fp16.mlir",
+ "exponential_minus_one.mlir",
+ "fft.mlir",
+ "finite.mlir",
+ "floor.mlir",
+ "gather.mlir",
+ "iota.mlir",
+ "log.mlir",
+ "log_plus_one.mlir",
+ "maximum.mlir",
+ "minimum.mlir",
+ "multiply.mlir",
+ "negate.mlir",
+ "pad.mlir",
+ "pow.mlir",
+ "reduce.mlir",
+ "reduce_window.mlir",
+ "remainder.mlir",
+ "reshape.mlir",
+ "reverse.mlir",
+ "rng_normal.mlir",
+ "rng_uniform.mlir",
+ "round.mlir",
+ "rsqrt.mlir",
+ "scatter.mlir",
+ "scatter_dynamic.mlir",
+ "select.mlir",
+ "sine.mlir",
+ "slice.mlir",
+ "sort.mlir",
+ "sqrt.mlir",
+ "subtract.mlir",
+ "tanh.mlir",
+ "torch_index_select.mlir",
+ "transpose.mlir",
+ "while.mlir",
+ ],
+ include = ["*.mlir"],
+ ),
+ compiler_flags = [
+ "-iree-input-type=mhlo",
+ "-iree-llvm-target-cpu-features=host",
+ ],
+ driver = "dylib",
+ target_backend = "dylib-llvm-aot",
+)
+
test_suite(
name = "check",
tests = [
+ ":check_dylib-llvm-aot-host_dylib",
":check_dylib-llvm-aot_dylib",
":check_vmvx_vmvx",
":check_vulkan-spirv_vulkan",
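A usage sketch for the new suite, assuming the standard workspace layout (the :check_dylib-llvm-aot-host_dylib label comes from the test_suite entry above; the dylib driver and host CPU feature detection must be available on the build host):

    # Run only the host-CPU-features variant of the XLA op checks.
    bazel test //iree/test/e2e/xla_ops:check_dylib-llvm-aot-host_dylib

    # Or run every check suite registered in this package.
    bazel test //iree/test/e2e/xla_ops:check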
diff --git a/iree/test/e2e/xla_ops/CMakeLists.txt b/iree/test/e2e/xla_ops/CMakeLists.txt
index f23f72e..cb97cc6 100644
--- a/iree/test/e2e/xla_ops/CMakeLists.txt
+++ b/iree/test/e2e/xla_ops/CMakeLists.txt
@@ -421,4 +421,73 @@
"-iree-llvm-target-triple=wasm32-unknown-emscripten"
)
+iree_check_single_backend_test_suite(
+ NAME
+ check_dylib-llvm-aot-host_dylib
+ SRCS
+ "abs.mlir"
+ "add.mlir"
+ "batch_norm_inference.mlir"
+ "bitcast_convert.mlir"
+ "broadcast.mlir"
+ "broadcast_add.mlir"
+ "broadcast_in_dim.mlir"
+ "clamp.mlir"
+ "compare.mlir"
+ "concatenate.mlir"
+ "constant.mlir"
+ "convert.mlir"
+ "convolution.mlir"
+ "cosine.mlir"
+ "divide.mlir"
+ "dot.mlir"
+ "dot_general.mlir"
+ "dynamic_slice.mlir"
+ "dynamic_update_slice.mlir"
+ "exponential.mlir"
+ "exponential_fp16.mlir"
+ "exponential_minus_one.mlir"
+ "fft.mlir"
+ "finite.mlir"
+ "floor.mlir"
+ "gather.mlir"
+ "iota.mlir"
+ "log.mlir"
+ "log_plus_one.mlir"
+ "maximum.mlir"
+ "minimum.mlir"
+ "multiply.mlir"
+ "negate.mlir"
+ "pad.mlir"
+ "pow.mlir"
+ "reduce.mlir"
+ "reduce_window.mlir"
+ "remainder.mlir"
+ "reshape.mlir"
+ "reverse.mlir"
+ "rng_normal.mlir"
+ "rng_uniform.mlir"
+ "round.mlir"
+ "rsqrt.mlir"
+ "scatter.mlir"
+ "scatter_dynamic.mlir"
+ "select.mlir"
+ "sine.mlir"
+ "slice.mlir"
+ "sort.mlir"
+ "sqrt.mlir"
+ "subtract.mlir"
+ "tanh.mlir"
+ "torch_index_select.mlir"
+ "transpose.mlir"
+ "while.mlir"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ DRIVER
+ "dylib"
+ COMPILER_FLAGS
+ "-iree-input-type=mhlo"
+ "-iree-llvm-target-cpu-features=host"
+)
+
### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/iree/test/e2e/xla_ops/reduce_window.mlir b/iree/test/e2e/xla_ops/reduce_window.mlir
index 08efbc2..24923d6 100644
--- a/iree/test/e2e/xla_ops/reduce_window.mlir
+++ b/iree/test/e2e/xla_ops/reduce_window.mlir
@@ -64,3 +64,20 @@
check.expect_almost_eq_const(%res, dense<[[[[1.0], [4.0]], [[13.0], [14.0]]]]> : tensor<1x2x2x1xf32>) : tensor<1x2x2x1xf32>
return
}
+
+func @reduce_window_max_with_padding_4x6xf32() {
+ %0 = util.unfoldable_constant dense<[[[[ 1.0], [ 2.0], [ 3.0], [ 4.0], [ 5.0], [ 6.0]],
+ [[ 7.0], [ 8.0], [ 9.0], [10.0], [11.0], [12.0]],
+ [[13.0], [14.0], [15.0], [16.0], [17.0], [18.0]],
+ [[19.0], [20.0], [21.0], [22.0], [23.0], [24.0]]]]> : tensor<1x4x6x1xf32>
+ %1 = util.unfoldable_constant dense<0.0> : tensor<f32>
+ %res = "mhlo.reduce_window"(%0, %1) ( {
+ ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>): // no predecessors
+ %3 = "mhlo.maximum"(%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+ "mhlo.return"(%3) : (tensor<f32>) -> ()
+ }) {window_dimensions = dense<[1, 2, 3, 1]> : tensor<4xi64>,
+ window_strides = dense<[1, 2, 3, 1]> : tensor<4xi64>,
+ padding = dense<[[0, 0], [1, 1], [0, 0], [0, 0]]> : tensor<4x2xi64>} : (tensor<1x4x6x1xf32>, tensor<f32>) -> tensor<1x3x2x1xf32>
+ check.expect_almost_eq_const(%res, dense<[[[[3.0], [6.0]], [[15.0], [18.0]], [[21.0], [24.0]]]]> : tensor<1x3x2x1xf32>) : tensor<1x3x2x1xf32>
+ return
+}
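As a quick check of the expected constant in the new test: the [[0, 0], [1, 1], [0, 0], [0, 0]] padding grows the 1x4x6x1 input to 1x6x6x1 (padded elements take the 0.0 init value), and 1x2x3x1 windows with 1x2x3x1 strides then yield a 1x3x2x1 result:

    rows {pad, 1..6}:     max over cols 1-3 = 3,  cols 4-6 = 6
    rows {7..12, 13..18}: max over cols 1-3 = 15, cols 4-6 = 18
    rows {19..24, pad}:   max over cols 1-3 = 21, cols 4-6 = 24

which matches dense<[[[[3.0], [6.0]], [[15.0], [18.0]], [[21.0], [24.0]]]]>.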
diff --git a/third_party/llvm-project b/third_party/llvm-project
index c38fadb..8361c5d 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit c38fadb7f97e8aca077df9c2a01ec43f8cb1f805
+Subproject commit 8361c5da30588d3d4a48eae648f53be1feb5cfad
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index f52e7dc..7727bff 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit f52e7dc1210dead4d3ff416696cad1c794332959
+Subproject commit 7727bfff1a219c9cd60087a1ae0a4b7e52916f57