Merge pull request #7131 from rsuderman:main-to-google
PiperOrigin-RevId: 398345005
diff --git a/.gitignore b/.gitignore
index 6fe9bf2..20f215e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
.pytype/
# Visual Studio files
+.env
.vs/
.vscode/
*.sdf
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de032a8..00a617a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,8 @@
option(IREE_BUILD_PYTHON_BINDINGS "Builds the IREE python bindings" OFF)
endif()
+option(IREE_BUILD_OLD_PYTHON_COMPILER_API "Builds the original Python compiler API" ON)
+
#-------------------------------------------------------------------------------
# Experimental project flags
#-------------------------------------------------------------------------------
@@ -438,7 +440,7 @@
enable_testing(iree)
endif()
-if(IREE_BUILD_PYTHON_BINDINGS)
+if(IREE_BUILD_PYTHON_BINDINGS AND NOT pybind11_FOUND)
add_subdirectory(third_party/pybind11 EXCLUDE_FROM_ALL)
endif()
diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt
index b80db7b..92a2475 100644
--- a/bindings/python/CMakeLists.txt
+++ b/bindings/python/CMakeLists.txt
@@ -13,7 +13,7 @@
add_subdirectory(iree/runtime)
add_subdirectory(iree/jax)
-if(${IREE_BUILD_COMPILER})
+if(IREE_BUILD_COMPILER AND IREE_BUILD_OLD_PYTHON_COMPILER_API)
add_subdirectory(iree/compiler)
add_subdirectory(iree/tools/core)
endif()
diff --git a/bindings/python/iree/runtime/system_api.py b/bindings/python/iree/runtime/system_api.py
index 3731781..fc82a3b 100644
--- a/bindings/python/iree/runtime/system_api.py
+++ b/bindings/python/iree/runtime/system_api.py
@@ -339,8 +339,7 @@
if backend is not None:
driver = TARGET_BACKEND_TO_DRIVER[backend]
vm_module = _binding.VmModule.from_flatbuffer(vm_flatbuffer)
- config = Config(TARGET_BACKEND_TO_DRIVER[backend])
- bound_module = load_vm_module(vm_module, config)
+ bound_module = load_vm_module(vm_module, Config(driver))
return bound_module
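
For readers skimming the Python side: the hunk above is a behavior-preserving cleanup, reusing the `driver` already looked up instead of repeating the `TARGET_BACKEND_TO_DRIVER` indexing. A minimal sketch of the resulting flow, where `Config`, `load_vm_module`, and `TARGET_BACKEND_TO_DRIVER` are the names from the surrounding file but the bodies here are hypothetical stand-ins, not the real iree.runtime bindings:

# Hypothetical stand-ins; the real definitions live in iree.runtime.
TARGET_BACKEND_TO_DRIVER = {"vulkan-spirv": "vulkan"}

class Config:
    def __init__(self, driver_name):
        self.driver_name = driver_name

def load_vm_module(vm_module, config):
    return (vm_module, config.driver_name)

def load(vm_module, backend="vulkan-spirv"):
    # After the hunk: the driver is looked up once and the Config is built
    # inline from it, rather than re-indexing TARGET_BACKEND_TO_DRIVER.
    driver = TARGET_BACKEND_TO_DRIVER[backend]
    return load_vm_module(vm_module, Config(driver))

assert load("m") == ("m", "vulkan")
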
diff --git a/build_tools/bazel/iree.bazelrc b/build_tools/bazel/iree.bazelrc
index cc6ddfb..e987141 100644
--- a/build_tools/bazel/iree.bazelrc
+++ b/build_tools/bazel/iree.bazelrc
@@ -260,7 +260,7 @@
# specific docker container the TF build is run in. The image URL is included
# for clarity and so that this reference is automatically updated by
# manage_images.py
-build:remote_cache_tf_integrations --host_platform_remote_properties_override='properties:{name:"cache-silo-key" value:"gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:103676490242311b9fad841294689a7ce1c755b935a21d8d898c25cfe3ec15e8"}'
+build:remote_cache_tf_integrations --host_platform_remote_properties_override='properties:{name:"cache-silo-key" value:"gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:da13c13dc427d9d2bdcba0ee70615088a89b5efd383625d9945eaf2ec17890bd"}'
### Remote Execution ###
# --config=rbe Execute and cache builds remotely.
diff --git a/build_tools/buildkite/samples.yml b/build_tools/buildkite/samples.yml
index f4e3d7c..18750f4 100644
--- a/build_tools/buildkite/samples.yml
+++ b/build_tools/buildkite/samples.yml
@@ -7,7 +7,7 @@
steps:
- label: "Test Colab notebooks"
commands:
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:30fe0cbaf16f8a523319e74ad96de7b8d6d5cd49edc76efdcdc34203fa8ea69f python3 colab/test_notebooks.py"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:a2be19816f8bc7aa417423e8ed29a537350cd7e1c9a6f6daa023fc90656fbced python3 colab/test_notebooks.py"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
@@ -15,7 +15,7 @@
- label: "Test Samples"
commands:
- - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:30fe0cbaf16f8a523319e74ad96de7b8d6d5cd49edc76efdcdc34203fa8ea69f build_tools/testing/test_samples.sh"
+ - "docker run --user=$(id -u):$(id -g) --volume=\\$PWD:\\$IREE_DOCKER_WORKDIR --workdir=\\$IREE_DOCKER_WORKDIR --rm gcr.io/iree-oss/samples@sha256:a2be19816f8bc7aa417423e8ed29a537350cd7e1c9a6f6daa023fc90656fbced build_tools/testing/test_samples.sh"
env:
IREE_DOCKER_WORKDIR: "/usr/src/github/iree"
agents:
diff --git a/build_tools/cmake/iree_macros.cmake b/build_tools/cmake/iree_macros.cmake
index f5049eb..cb90828 100644
--- a/build_tools/cmake/iree_macros.cmake
+++ b/build_tools/cmake/iree_macros.cmake
@@ -215,8 +215,8 @@
# If this file is included in multiple rules, only create the target once.
string(REPLACE "::" "_" _DATA_TARGET ${_DATA_LABEL})
if(NOT TARGET ${_DATA_TARGET})
- set(_INPUT_PATH "${CMAKE_SOURCE_DIR}/${_FILE_PATH}")
- set(_OUTPUT_PATH "${CMAKE_BINARY_DIR}/${_FILE_PATH}")
+ set(_INPUT_PATH "${PROJECT_SOURCE_DIR}/${_FILE_PATH}")
+ set(_OUTPUT_PATH "${PROJECT_BINARY_DIR}/${_FILE_PATH}")
add_custom_target(${_DATA_TARGET}
COMMAND ${CMAKE_COMMAND} -E copy ${_INPUT_PATH} ${_OUTPUT_PATH}
)
diff --git a/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile b/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile
index 773acf4..45ba144 100644
--- a/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile
+++ b/build_tools/docker/cmake-bazel-frontends-swiftshader/Dockerfile
@@ -5,7 +5,7 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
FROM gcr.io/iree-oss/cmake-bazel-frontends-vulkan@sha256:cdf41d7ee7707eb3e79d56f2f1f8bd7e9a0ac3a1122dc4f89f8190154796a6bc AS final
-COPY --from=gcr.io/iree-oss/swiftshader@sha256:3cc8c95c6607891a77dbd01ec06ee4f4bfad26f6ff23a368e44fb6189f51ec02 \
+COPY --from=gcr.io/iree-oss/swiftshader@sha256:1de855baa93acbc2d4dcbf6c5509ecb6d4fa04ea5f5322a1c4faa54ea278e995 \
/swiftshader /swiftshader
# Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
diff --git a/build_tools/docker/cmake-python-swiftshader/Dockerfile b/build_tools/docker/cmake-python-swiftshader/Dockerfile
index 45946dd..62fc864 100644
--- a/build_tools/docker/cmake-python-swiftshader/Dockerfile
+++ b/build_tools/docker/cmake-python-swiftshader/Dockerfile
@@ -8,7 +8,7 @@
# Vulkan implementation.
FROM gcr.io/iree-oss/cmake-python-vulkan@sha256:894d91b6ddd3435f0e5cb4424a81296438252dae4d8934e84aa6be4d02b81343 AS final
-COPY --from=gcr.io/iree-oss/swiftshader@sha256:3cc8c95c6607891a77dbd01ec06ee4f4bfad26f6ff23a368e44fb6189f51ec02 \
+COPY --from=gcr.io/iree-oss/swiftshader@sha256:1de855baa93acbc2d4dcbf6c5509ecb6d4fa04ea5f5322a1c4faa54ea278e995 \
/swiftshader /swiftshader
# Set VK_ICD_FILENAMES so Vulkan loader can find the SwiftShader ICD.
diff --git a/build_tools/docker/prod_digests.txt b/build_tools/docker/prod_digests.txt
index 579d710..f136d73 100644
--- a/build_tools/docker/prod_digests.txt
+++ b/build_tools/docker/prod_digests.txt
@@ -1,20 +1,20 @@
gcr.io/iree-oss/base@sha256:9b73f4e2b1239f65a19f2022e54f4b15310b805570831fbe2cf8b4dc928f1d10
gcr.io/iree-oss/util@sha256:40846b4aea5886af3250399d6adfdb3e1195a8b0177706bb0375e812d62dc49c
gcr.io/iree-oss/cmake@sha256:9d9953acf5ca0cf1ff3e8de32f10f24dfab1c4e8ec5d1fc047f556024ee4bed6
-gcr.io/iree-oss/swiftshader@sha256:3cc8c95c6607891a77dbd01ec06ee4f4bfad26f6ff23a368e44fb6189f51ec02
+gcr.io/iree-oss/swiftshader@sha256:1de855baa93acbc2d4dcbf6c5509ecb6d4fa04ea5f5322a1c4faa54ea278e995
gcr.io/iree-oss/cmake-python@sha256:51817f1a98f9ed9237577133b4c674b163280fd747c1745d6d0d93f0f2b01fb3
gcr.io/iree-oss/cmake-android@sha256:7d780787608474301e74e1b5cc2a1bfd1304a79ed9e0774c7ed422c0e4a38625
gcr.io/iree-oss/bazel@sha256:5e52c7d43b6fdff35d884b8b8b92b1b6e2151d675019edc92f09018e558e0f94
gcr.io/iree-oss/vulkan@sha256:5812ee64806a7f3df0739ccf0930c27cabce346901488eceb1ee66c9c0a5ae96
gcr.io/iree-oss/rbe-toolchain@sha256:62b161e79413f0f59ae3845c377b10e60a4a639f3d32569a82b620f017837a68
gcr.io/iree-oss/cmake-python-vulkan@sha256:894d91b6ddd3435f0e5cb4424a81296438252dae4d8934e84aa6be4d02b81343
-gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e
+gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5
gcr.io/iree-oss/cmake-python-nvidia@sha256:d96ffdc44026bf112efca82a5150d783e8eba8976c7bc150863ec5868de40778
gcr.io/iree-oss/cmake-bazel-frontends@sha256:7633ff2b483a07b6c786ffab40cca7cae64d6a211ad0e95ff55d3f1cd2dd1ea9
gcr.io/iree-oss/cmake-bazel-frontends-vulkan@sha256:cdf41d7ee7707eb3e79d56f2f1f8bd7e9a0ac3a1122dc4f89f8190154796a6bc
gcr.io/iree-oss/cmake-bazel-frontends-nvidia@sha256:7a2189f9c2c5491878fdf6d38ddab18832020a06285eeff31b8376b9415fb7e9
-gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:26fbfb5e4813a2d118b68694398de950c808228757a79f2a8909dd332792d9d3
+gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:da13c13dc427d9d2bdcba0ee70615088a89b5efd383625d9945eaf2ec17890bd
gcr.io/iree-oss/cmake-riscv@sha256:95489593bc9b0cd325ce9c1a32b47389c01b174a5b8190a16d937d2e8828d384
gcr.io/iree-oss/cmake-bazel-frontends-android@sha256:cdb1b38772643f7acbc296b558ccc868900a47a1378cf63da3fbe469dcf42428
-gcr.io/iree-oss/samples@sha256:30fe0cbaf16f8a523319e74ad96de7b8d6d5cd49edc76efdcdc34203fa8ea69f
+gcr.io/iree-oss/samples@sha256:a2be19816f8bc7aa417423e8ed29a537350cd7e1c9a6f6daa023fc90656fbced
gcr.io/iree-oss/cmake-emscripten@sha256:8acad361d23cb586187c2ea29df3a1ab301b5283c3648beb328681d69ecd0ab0
diff --git a/build_tools/docker/rbe-toolchain/Dockerfile b/build_tools/docker/rbe-toolchain/Dockerfile
index f7df137..66c615e 100644
--- a/build_tools/docker/rbe-toolchain/Dockerfile
+++ b/build_tools/docker/rbe-toolchain/Dockerfile
@@ -34,7 +34,7 @@
python3 \
zlib1g-dev
-ARG SWIFTSHADER_COMMIT=e32890c54793664e21891a504793cc8c0c7a34fb
+ARG SWIFTSHADER_COMMIT=755b78dc66b2362621a78b6964a9df3af94e960c
RUN git clone https://github.com/google/swiftshader
RUN cd swiftshader && git checkout "${SWIFTSHADER_COMMIT?}" && cd ..
diff --git a/build_tools/docker/samples/Dockerfile b/build_tools/docker/samples/Dockerfile
index 403d5d3..303aa44 100644
--- a/build_tools/docker/samples/Dockerfile
+++ b/build_tools/docker/samples/Dockerfile
@@ -9,7 +9,7 @@
# * Vulkan (using SwiftShader)
# * Python (including `venv` and common pip packages needed for Colab)
-FROM gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e AS final
+FROM gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5 AS final
# Update setuptools per https://github.com/pypa/setuptools/issues/1694#issuecomment-466010982
RUN apt-get update && apt-get install -y python3-venv python3.7-venv python-setuptools && \
diff --git a/build_tools/docker/swiftshader/Dockerfile b/build_tools/docker/swiftshader/Dockerfile
index f588924..9ffd4c2 100644
--- a/build_tools/docker/swiftshader/Dockerfile
+++ b/build_tools/docker/swiftshader/Dockerfile
@@ -9,7 +9,7 @@
RUN apt-get update && apt-get install -y git
-ARG SWIFTSHADER_COMMIT=e32890c54793664e21891a504793cc8c0c7a34fb
+ARG SWIFTSHADER_COMMIT=755b78dc66b2362621a78b6964a9df3af94e960c
# zlib is needed for compiling SwiftShader.
RUN apt-get update && apt-get install -y zlib1g-dev
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
index 8d13c4c..da102bd 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:26fbfb5e4813a2d118b68694398de950c808228757a79f2a8909dd332792d9d3 \
+ gcr.io/iree-oss/cmake-bazel-frontends-swiftshader@sha256:da13c13dc427d9d2bdcba0ee70615088a89b5efd383625d9945eaf2ec17890bd \
build_tools/kokoro/gcp_ubuntu/cmake-bazel/linux/x86-swiftshader/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
index e53e033..728b3ef 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader-asan/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e \
+ gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5 \
build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
index e53e033..728b3ef 100755
--- a/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
+++ b/build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build_kokoro.sh
@@ -24,7 +24,7 @@
docker_setup
docker run "${DOCKER_RUN_ARGS[@]?}" \
- gcr.io/iree-oss/cmake-python-swiftshader@sha256:f13737fbcf79a2d9a6549e66ec75ecb05a6a35edeb32709a1b4c5f13f0ab955e \
+ gcr.io/iree-oss/cmake-python-swiftshader@sha256:9311f8a6e2a8bed546bc97f22751bd383d269b38e56e46371fcbd8b2e7944ac5 \
build_tools/kokoro/gcp_ubuntu/cmake/linux/x86-swiftshader/build.sh
# Kokoro will rsync this entire directory back to the executor orchestrating the
diff --git a/build_tools/third_party/swiftshader/build_vk_swiftshader.sh b/build_tools/third_party/swiftshader/build_vk_swiftshader.sh
index 9289e62..9e466c1 100755
--- a/build_tools/third_party/swiftshader/build_vk_swiftshader.sh
+++ b/build_tools/third_party/swiftshader/build_vk_swiftshader.sh
@@ -54,7 +54,7 @@
SWIFTSHADER_INSTALL_DIR="${BASE_DIR?}"'\.swiftshader'
fi
-SWIFTSHADER_COMMIT=e32890c54793664e21891a504793cc8c0c7a34fb
+SWIFTSHADER_COMMIT=755b78dc66b2362621a78b6964a9df3af94e960c
SWIFTSHADER_DIR="$(mktemp --directory --tmpdir swiftshader_XXXXXX)"
# Clone swiftshader and checkout the appropriate commit.
diff --git a/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 5d2ddb0..cb554f2 100644
--- a/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -221,7 +221,41 @@
IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUVectorize, workgroupSize);
}
+/// Propagates the configuration annotated in the incoming IR.
+static LogicalResult setUserConfig(FuncOp entryPointFn, Operation *computeOp,
+ IREE::HAL::LoweringConfig config) {
+ IREE::HAL::DispatchLoweringPassPipeline passPipeline =
+ IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUVectorize;
+ if (auto setPassPipeline = getLoweringPassPipeline(config)) {
+ passPipeline = setPassPipeline.getValue();
+ }
+ SmallVector<int64_t, 4> workgroupSize;
+ if (auto workgroupSizeAttr = config.workgroupSize()) {
+ workgroupSize = llvm::to_vector<4>(
+ llvm::map_range(workgroupSizeAttr, [](Attribute intAttr) {
+ return intAttr.cast<IntegerAttr>().getInt();
+ }));
+ }
+ if (failed(setOpConfigAndEntryPointFnTranslation(
+ entryPointFn, computeOp, config, passPipeline, workgroupSize))) {
+ return failure();
+ }
+ // Reset the op configuration to drop the pass-pipeline and workgroup size
+ // info. The op does not carry that information anymore.
+ auto resetConfig = IREE::HAL::LoweringConfig::get(
+ config.tileSizes(), config.nativeVectorSize(),
+ /*passPipeline=*/nullptr,
+ /*workgroupSize=*/nullptr, computeOp->getContext());
+ setLoweringConfig(computeOp, resetConfig);
+ return success();
+}
+
static LogicalResult setRootConfig(FuncOp entryPointFn, Operation *computeOp) {
+ if (IREE::HAL::LoweringConfig config = getLoweringConfig(computeOp)) {
+ // If the op already has a lowering config coming from the IR, use it and
+ // bypass the heuristic.
+ return setUserConfig(entryPointFn, computeOp, config);
+ }
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
if (linalg::isaContractionOpInterface(linalgOp) &&
linalgOp.getNumParallelLoops() >= 2) {
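
In plain terms, the new path gives user-annotated configs precedence over the heuristics: the annotated pass pipeline and workgroup size are hoisted onto the entry point, and the op's config is reset to carry only tile sizes. A minimal sketch, assuming attributes behave like dict entries; the attribute names (lowering.config, passPipeline, workgroupSize, tileSizes, translation.info) come from the new test below, while the accessors are hypothetical:

# Sketch of the user-config precedence added in setRootConfig/setUserConfig.
def set_root_config(entry_point, compute_op):
    config = compute_op.get("lowering.config")
    if config is not None:
        # Hoist the pass pipeline and workgroup size onto the entry point,
        # defaulting the pipeline when the annotation omits it.
        entry_point["translation.info"] = {
            "passPipeline": config.get("passPipeline", "LLVMGPUVectorize")}
        entry_point["workgroup_size"] = config.get("workgroupSize")
        # Then reset the op's config so it only carries tile sizes.
        compute_op["lowering.config"] = {"tileSizes": config["tileSizes"]}
        return True
    return False  # fall through to the shape-based heuristics

op = {"lowering.config": {"passPipeline": "LLVMGPUMatmulSimt",
                          "tileSizes": [[32, 256, 64], [], [4, 16]],
                          "workgroupSize": [16, 8, 1]}}
entry = {}
set_root_config(entry, op)
print(entry)  # pipeline and workgroup size moved onto the entry point
print(op)     # {'lowering.config': {'tileSizes': [[32, 256, 64], [], [4, 16]]}}

The @user_config test added to gpu_set_num_workgroups.mlir below exercises exactly this path.
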
diff --git a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 3665903..07cd247 100644
--- a/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -364,3 +364,65 @@
// CHECK: func @static_3d_fft_stage3()
// CHECK: linalg_ext.fft
// CHECK-SAME: lowering.config = #[[CONFIG]]
+
+// -----
+
+hal.executable @user_config {
+hal.executable.variant public @cuda_nvptx_fb, target = #hal.executable.target<"cuda", "cuda-nvptx-fb"> {
+ hal.executable.entry_point public @_lowering_config_test_dispatch_1 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @_lowering_config_test_dispatch_1() {
+ %cst = constant 0.000000e+00 : f32
+ %c128 = constant 128 : index
+ %c1024 = constant 1024 : index
+ %c0 = constant 0 : index
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:128x256xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:256x1024xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:128x1024xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c128 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c1024 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:128x256xf32> -> tensor<?x256xf32>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [256, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:256x1024xf32> -> tensor<256x?xf32>
+ %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg0)[%workgroup_size_y]
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg1)[%workgroup_size_x]
+ %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 128, s0)>(%arg0)[%workgroup_size_y]
+ %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg1)[%workgroup_size_x]
+ %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+ %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = {passPipeline = "LLVMGPUMatmulSimt", tileSizes = [[32, 256, 64], [], [4, 16]], workgroupSize = [16, 8, 1]}} ins(%8, %10 : tensor<?x256xf32>, tensor<256x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:128x1024xf32>
+ }
+ }
+ return
+ }
+ hal.interface private @io {
+ hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+}
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = {{{.*}}tileSizes = {{\[}}[32, 256, 64], [], [4, 16]{{\]}}}
+// CHECK: hal.executable.entry_point public @_lowering_config_test_dispatch_1
+// CHECK-SAME: passPipeline = "LLVMGPUMatmulSimt"
+// CHECK-SAME: workloadPerWorkgroup = [256, 32]
+// CHECK-SAME: workgroup_size = [16 : index, 8 : index, 1 : index]
+// CHECK: func @_lowering_config_test_dispatch_1
+// CHECK: linalg.fill
+// CHECK-SAME: lowering.config = #[[CONFIG]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering.config = #[[CONFIG]]
diff --git a/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp b/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp
index 37fac74..38240a8 100644
--- a/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/AdrenoConfig.cpp
@@ -13,9 +13,6 @@
#include <array>
#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "llvm/Support/MathExtras.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
namespace mlir {
@@ -23,100 +20,6 @@
namespace detail {
//===----------------------------------------------------------------------===//
-// Matmul
-//===----------------------------------------------------------------------===//
-
-static LogicalResult setOpConfig(linalg::LinalgOp op) {
- ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
- ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
- if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success();
- if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success();
-
- bool isBM = isa<linalg::BatchMatmulOp>(op);
-
- int64_t dimM = lhsShape[0 + isBM];
- int64_t dimK = lhsShape[1 + isBM];
- int64_t dimN = rhsShape[1 + isBM];
-
- // The core idea is to distribute the matmul M/N dimension to the workgroup
- // Y/X dimension, with each thread in a workgroup handling multiple vector
- // elements. We start from the best (X, Y) and the tiling sizes for (M, N, K)
- // and try different configurations by scaling them down until we find a
- // configuration that can perfectly tile the input matmul.
-
- const int64_t bestX = 32, bestY = 2;
- const int64_t bestThreadM = 8, bestThreadN = 8, bestThreadK = 4;
-
- int64_t residualThreads = bestX * bestY;
- int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN;
-
- SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z)
- SmallVector<int64_t, 4> workgroupTileSizes(3 + isBM, 0); // (B, M, N, K)
- SmallVector<int64_t, 4> invocationTileSizes(3 + isBM, 0); // (B, M, N, K)
-
- if (isBM) workgroupTileSizes[0] = invocationTileSizes[0] = 1;
-
- // Deduce the configuration for the N dimension. Start with the best workgroup
- // X size, and reduce by a factor of two each time.
- for (int64_t x = bestX; x >= 2; x >>= 1) {
- // Handle 4 elements per thread for the innermost dimension. We need this
- // for vectorized load.
- int64_t chosenTileSize = 4;
- if (dimN % (x * chosenTileSize) == 0) {
- workgroupSize[0] = x;
- workgroupTileSizes[1 + isBM] = x * chosenTileSize;
- invocationTileSizes[1 + isBM] = chosenTileSize;
- residualThreads /= x;
- assert(residualTilingFactor % chosenTileSize == 0);
- residualTilingFactor /= chosenTileSize;
- break;
- }
- }
- if (workgroupTileSizes[1 + isBM] == 0) return success();
-
- // Deduce the configuration for the M dimension. Start with the best workgroup
- // Y size, and reduce by a factor of two each time.
- for (int64_t y = residualThreads; y >= 1; y >>= 1) {
- int64_t chosenTileSize = 0;
- // Reduce the thread tiling size by one each time. We read one row each
- // time; so it's fine to not be some power of two here.
- for (int64_t t = bestThreadM; t >= 1; --t) {
- if (dimM % (y * t) == 0) {
- chosenTileSize = t;
- break;
- }
- }
- if (chosenTileSize) {
- workgroupSize[1] = y;
- workgroupTileSizes[0 + isBM] = y * chosenTileSize;
- invocationTileSizes[0 + isBM] = chosenTileSize;
- assert(residualTilingFactor > chosenTileSize);
- residualTilingFactor -= chosenTileSize;
- break;
- }
- }
- if (workgroupTileSizes[0 + isBM] == 0) return success();
-
- // Deduce the configuration for the K dimension. We need some power of two
- // here so that we can do vector load.
- for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 1; t >>= 1) {
- if (dimK % t == 0) {
- workgroupTileSizes[2 + isBM] = invocationTileSizes[2 + isBM] = t;
- break;
- }
- }
-
- auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
- TileSizesListType tileSizes;
- tileSizes.push_back(workgroupTileSizes);
- tileSizes.emplace_back();
- tileSizes.push_back(invocationTileSizes);
- return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
- op, tileSizes, {}, pipeline,
- workgroupSize);
-}
-
-//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
@@ -124,8 +27,11 @@
Operation *rootOp) {
int64_t subgroupSize = targetEnv.getResourceLimits().subgroup_size().getInt();
return TypeSwitch<Operation *, LogicalResult>(rootOp)
- .Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
- [](auto op) { return setOpConfig(op); })
+ .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([](auto op) {
+ std::array<int64_t, 2> workgroupXY = {32, 2};
+ std::array<int64_t, 3> threadMNK = {16, 4, 4};
+ return setMatmulOpConfig(op, workgroupXY, threadMNK);
+ })
.Case<linalg::Conv2DNhwcHwcfOp>([subgroupSize](auto op) {
return setConvOpConfig(op, subgroupSize,
/*bestTilingFactor=*/32);
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index d8cd080..fa57109 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -29,21 +29,6 @@
// Utilities
//===----------------------------------------------------------------------===//
-/// Given `nprocs`, tries to distribute it evenly across 2 logical dimensions.
-static std::tuple<int64_t, int64_t> distributeProcs2D(int64_t nprocs) {
- int64_t nprocs_x = std::max<int64_t>(
- 1, static_cast<int64_t>(
- llvm::PowerOf2Ceil(static_cast<uint64_t>(std::sqrt(nprocs)))));
- return std::make_tuple(nprocs_x, nprocs / nprocs_x);
-}
-
-/// Returns the minimum of `shape` and `tileSize` if shape is static.
-/// Returns `tileSize` otherwise.
-int64_t getMinIfStaticShape(int64_t shape, int64_t tileSize) {
- if (shape == ShapedType::kDynamicSize) return tileSize;
- return std::min(shape, tileSize);
-}
-
/// Defines the workgroup count region on entry point ops for the
/// `SPIRVDistributeToGlobalID` pipeline.
// TODO(ravishankarm): Remove this when that pipeline is deprecated.
@@ -218,79 +203,108 @@
// Matmul Default Configuration
//===----------------------------------------------------------------------===//
-static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
- linalg::BatchMatmulOp op) {
- unsigned maxWorkgroupSize =
- limits.max_compute_workgroup_invocations().getInt();
+namespace detail {
- // This is just being hard-wired for now to be minimal viable, but this can be
- // decided better when we have better estimates of device charecteristics.
- const int64_t numRowsPerThread = 1;
- const int64_t numColsPerThread = 1;
- const int64_t numBatchesPerThread = 1;
- const int64_t tileSizeK = 0;
-
- std::array<int64_t, 3> workgroupSize = {1, 1, 1};
- std::tie(workgroupSize[0], workgroupSize[1]) =
- distributeProcs2D(maxWorkgroupSize);
-
- auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;
-
- TileSizesListType tileSizes;
- // Workgroup level.
- tileSizes.push_back({numBatchesPerThread, numRowsPerThread * workgroupSize[1],
- numColsPerThread * workgroupSize[0], tileSizeK});
- // No tiling at the subgroup level since this target doesn't use subgroup op
- // or shared memory.
- tileSizes.emplace_back();
- // Invocation level.
- tileSizes.push_back(
- {numBatchesPerThread, numRowsPerThread, numColsPerThread, 0});
-
- return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
- op, tileSizes, {}, pipeline,
- workgroupSize);
-}
-
-static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
- linalg::MatmulOp op) {
- unsigned maxWorkgroupSize =
- limits.max_compute_workgroup_invocations().getInt();
-
- std::array<int64_t, 3> workgroupSize = {1, 1, 1};
- std::tie(workgroupSize[0], workgroupSize[1]) =
- distributeProcs2D(maxWorkgroupSize);
-
- const int numRowsPerThread = 1;
- const int numColsPerThread = 1;
- int64_t tileSizeK = 0;
-
+LogicalResult setMatmulOpConfig(linalg::LinalgOp op,
+ std::array<int64_t, 2> bestWorkgroupSizeXY,
+ std::array<int64_t, 3> bestThreadTileSizeMNK) {
ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
+ if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success();
+ if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success();
- int64_t M = lhsShape[0];
- int64_t N = rhsShape[1];
- int64_t K = lhsShape[1];
+ bool isBM = isa<linalg::BatchMatmulOp>(op);
- auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;
+ int64_t dimM = lhsShape[0 + isBM];
+ int64_t dimK = lhsShape[1 + isBM];
+ int64_t dimN = rhsShape[1 + isBM];
+ // The core idea is to distribute the matmul M/N dimension to the workgroup
+ // Y/X dimension, with each thread in a workgroup handling multiple vector
+ // elements. We start from the best (X, Y) and the tiling sizes for (M, N, K)
+ // and try different configurations by scaling them down until we find a
+ // configuration that can perfectly tile the input matmul.
+
+ const int64_t bestX = bestWorkgroupSizeXY[0], bestY = bestWorkgroupSizeXY[1];
+ const int64_t bestThreadM = bestThreadTileSizeMNK[0],
+ bestThreadN = bestThreadTileSizeMNK[1],
+ bestThreadK = bestThreadTileSizeMNK[2];
+
+ int64_t residualThreads = bestX * bestY;
+ int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN;
+
+ SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z)
+ SmallVector<int64_t, 4> workgroupTileSizes(3 + isBM, 0); // (B, M, N, K)
+ SmallVector<int64_t, 4> invocationTileSizes(3 + isBM, 0); // (B, M, N, K)
+
+ if (isBM) workgroupTileSizes[0] = invocationTileSizes[0] = 1;
+
+ // Deduce the configuration for the N dimension. Start with the best workgroup
+ // X size, and reduce by a factor of two each time.
+ for (int64_t x = bestX; x >= 2; x >>= 1) {
+ // Handle bestThreadN elements per thread for the innermost dimension. We
+ // need this for vectorized load.
+ int64_t chosenTileSize = bestThreadN;
+ if (dimN % (x * chosenTileSize) == 0) {
+ workgroupSize[0] = x;
+ workgroupTileSizes[1 + isBM] = x * chosenTileSize;
+ invocationTileSizes[1 + isBM] = chosenTileSize;
+ residualThreads /= x;
+ assert(residualTilingFactor % chosenTileSize == 0);
+ residualTilingFactor /= chosenTileSize;
+ break;
+ }
+ }
+ if (workgroupTileSizes[1 + isBM] == 0) return success();
+
+ // Deduce the configuration for the M dimension. Start with the best workgroup
+ // Y size, and reduce by a factor of two each time.
+ for (int64_t y = residualThreads; y >= 1; y >>= 1) {
+ int64_t chosenTileSize = 0;
+ // Reduce the thread tiling size by one each time. We read one row each
+ // time, so it need not be a power of two here.
+ for (int64_t t = bestThreadM; t >= 1; --t) {
+ if (dimM % (y * t) == 0) {
+ chosenTileSize = t;
+ break;
+ }
+ }
+ if (chosenTileSize) {
+ workgroupSize[1] = y;
+ workgroupTileSizes[0 + isBM] = y * chosenTileSize;
+ invocationTileSizes[0 + isBM] = chosenTileSize;
+ assert(residualTilingFactor > chosenTileSize);
+ residualTilingFactor -= chosenTileSize;
+ break;
+ }
+ }
+ if (workgroupTileSizes[0 + isBM] == 0) return success();
+
+ // Deduce the configuration for the K dimension. We need a power of two
+ // here so that we can do vectorized loads.
+ for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 1; t >>= 1) {
+ if (dimK % t == 0) {
+ workgroupTileSizes[2 + isBM] = invocationTileSizes[2 + isBM] = t;
+ break;
+ }
+ }
+
+ auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
TileSizesListType tileSizes;
- // Workgroup level.
- tileSizes.push_back(
- {getMinIfStaticShape(M, numRowsPerThread * workgroupSize[1]),
- getMinIfStaticShape(N, numColsPerThread * workgroupSize[0]),
- getMinIfStaticShape(K, tileSizeK)});
- // No tiling at the subgroup level since this target doesn't use subgroup op
- // or shared memory.
+ tileSizes.push_back(workgroupTileSizes);
tileSizes.emplace_back();
- // Invocation level.
- tileSizes.push_back({1, 1, 0});
-
+ tileSizes.push_back(invocationTileSizes);
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
workgroupSize);
}
+} // namespace detail
+
+//===----------------------------------------------------------------------===//
+// FFT Default Configuration
+//===----------------------------------------------------------------------===//
+
static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
linalg_ext::FftOp op) {
const int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue();
@@ -441,10 +455,22 @@
// Check whether there is actually a configuration found. If so, it's done.
if (getLoweringConfig(rootOp)) return result;
- // Otherwise fallback to use a default configuration.
+ // Otherwise fall back to a default configuration that tiles and
+ // distributes/vectorizes.
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
return TypeSwitch<Operation *, LogicalResult>(rootOp)
- .Case<linalg::BatchMatmulOp, linalg::MatmulOp, linalg_ext::FftOp>(
+ .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([limits](auto op) {
+ // Try to tile and vectorize first.
+ std::array<int64_t, 2> workgroupXY = {32, 2};
+ std::array<int64_t, 3> threadMNK = {8, 8, 4};
+ auto result = detail::setMatmulOpConfig(op, workgroupXY, threadMNK);
+ if (failed(result)) return result;
+ if (getLoweringConfig(op)) return result;
+
+ // If unsuccessful, try to tile and distribute.
+ return setDefaultOpConfig(limits, op);
+ })
+ .Case<linalg_ext::FftOp>(
[limits](auto op) { return setOpConfig(limits, op); })
.Case<linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwOp>(
[limits](auto op) {
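
The matmul heuristic that the vendor configs now share deserves a closer look. Below is a Python transcription of setMatmulOpConfig as added above; a sketch for reasoning about the numbers in the updated tests, not the implementation itself. The (M, N, K) tile ordering and the loop structure follow the C++ line by line:

def set_matmul_op_config(dims, workgroup_xy, thread_mnk):
    """Sketch of setMatmulOpConfig: scale the best workgroup (X, Y) and
    per-thread (M, N, K) tile sizes down until they perfectly tile the
    static matmul shape. Returns None where the C++ leaves the op
    unconfigured (so a later default config can kick in)."""
    dim_m, dim_n, dim_k = dims
    best_x, best_y = workgroup_xy
    best_m, best_n, best_k = thread_mnk

    residual_threads = best_x * best_y
    residual_factor = (best_m + best_k) * best_n
    workgroup_size = [1, 1, 1]  # (X, Y, Z)
    wg_tiles = [0, 0, 0]        # workgroup-level (M, N, K)
    inv_tiles = [0, 0, 0]       # invocation-level (M, N, K)

    # N dimension: halve the workgroup X size until x * best_n divides N;
    # each thread keeps best_n elements so loads stay vectorized.
    x = best_x
    while x >= 2:
        if dim_n % (x * best_n) == 0:
            workgroup_size[0] = x
            wg_tiles[1], inv_tiles[1] = x * best_n, best_n
            residual_threads //= x
            residual_factor //= best_n
            break
        x //= 2
    if wg_tiles[1] == 0:
        return None

    # M dimension: halve the workgroup Y size; the per-thread tile shrinks
    # by one each step since rows need not be a power of two.
    y = residual_threads
    while y >= 1:
        t = next((t for t in range(best_m, 0, -1) if dim_m % (y * t) == 0), 0)
        if t:
            workgroup_size[1] = y
            wg_tiles[0], inv_tiles[0] = y * t, t
            residual_factor -= t
            break
        y //= 2
    if wg_tiles[0] == 0:
        return None

    # K dimension: the largest power of two within the residual budget that
    # divides K (PowerOf2Floor in the C++).
    t = 1 << (residual_factor.bit_length() - 1)
    while t >= 1:
        if dim_k % t == 0:
            wg_tiles[2] = inv_tiles[2] = t
            break
        t //= 2
    return workgroup_size, wg_tiles, inv_tiles

# Adreno hints on the 1024x2048x512 matmul reproduce the updated CHECK lines
# in config_adreno_matmul.mlir below:
# ([32, 2, 1], [32, 128, 4], [16, 4, 4])
print(set_matmul_op_config((1024, 2048, 512), (32, 2), (16, 4, 4)))

The batch-matmul case simply prepends a unit batch tile to both tile-size lists before handing them off.
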
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.h b/iree/compiler/Codegen/SPIRV/KernelConfig.h
index 51838ee..c0d4f31 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.h
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.h
@@ -33,6 +33,12 @@
const int64_t subgroupSize,
const int64_t bestTilingFactor);
+/// Sets the CodeGen configuration, via attributes, on the given matmul
+/// `linalgOp` using the given workgroup size and thread tile size hints.
+LogicalResult setMatmulOpConfig(linalg::LinalgOp linalgOp,
+ std::array<int64_t, 2> bestWorkgroupSizeXY,
+ std::array<int64_t, 3> bestThreadTileSizeMNK);
+
/// Sets CodeGen configuration for GPUs from a specific vendor.
///
/// If the given `rootOp` has known good CodeGen configuration, attaches a
diff --git a/iree/compiler/Codegen/SPIRV/MaliConfig.cpp b/iree/compiler/Codegen/SPIRV/MaliConfig.cpp
index 4f01291..1c19955 100644
--- a/iree/compiler/Codegen/SPIRV/MaliConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/MaliConfig.cpp
@@ -13,166 +13,12 @@
#include <array>
#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
namespace mlir {
namespace iree_compiler {
namespace detail {
-struct TileWorkgroupSizePair {
- // How many scalar elements each workgroup should handle along each dimension.
- std::array<int64_t, 3> tileSize;
- // The number of threads per workgroup along each dimension.
- std::array<int64_t, 3> workgroupSize;
-};
-
-//===----------------------------------------------------------------------===//
-// Matmul
-//===----------------------------------------------------------------------===//
-
-/// Writes preferred matmul workgroup tile sizes and workgroup size into
-/// `pairs` for the given matmul `scale` (MxNxK) and `elementType`.
-static void getMatmulTileAndWorkgroupSizes(
- int64_t scale, Type elementType,
- SmallVectorImpl<TileWorkgroupSizePair> &pairs) {
- if (elementType.isF16()) {
- const int64_t smallMatrixSizeThreshold = 512 * 512;
- // For smaller destination size we cannot fill out the GPU with bigger tile
- // sizes. Instead we pick smaller tiles along M and N to increase the number
- // of workgroups and a larger K tile size since we have lower pressure and
- // need extra instructions to hide latency.
- // TODO: The threshold needs to be fine tuned by doing exploration based on
- // matrix shapes.
- if (scale <= smallMatrixSizeThreshold) {
- pairs.push_back(TileWorkgroupSizePair({{16, 32, 16}, {8, 2, 1}}));
- } else {
- pairs.push_back(TileWorkgroupSizePair({{16, 64, 4}, {8, 2, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{8, 128, 4}, {8, 2, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{16, 32, 4}, {8, 2, 1}}));
- }
- return;
- }
-
- // TODO: Heuristic picked based on MobileNet performance. We need
- // auto-tuning to be able to make a smarter choice.
- const int64_t smallMatrixSizeThreshold = 20000;
-
- if (scale <= smallMatrixSizeThreshold) {
- pairs.push_back(TileWorkgroupSizePair({{4, 32, 16}, {8, 2, 1}}));
- }
- pairs.push_back(TileWorkgroupSizePair({{12, 32, 4}, {8, 2, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{14, 32, 4}, {8, 2, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{10, 32, 4}, {8, 2, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{7, 64, 4}, {16, 1, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{8, 32, 4}, {8, 2, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{6, 32, 4}, {8, 2, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{24, 16, 4}, {2, 8, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{16, 16, 4}, {2, 8, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{24, 8, 4}, {2, 8, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{40, 8, 4}, {2, 8, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{32, 8, 4}, {2, 8, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{16, 8, 4}, {2, 8, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{1, 32, 16}, {8, 1, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{1, 32, 8}, {8, 1, 1}}));
- pairs.push_back(TileWorkgroupSizePair({{1, 32, 4}, {8, 1, 1}}));
-}
-
-/// Launch configuration for Mali GPU configuration.
-static LogicalResult setOpConfig(linalg::BatchMatmulOp op) {
- ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
- ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
-
- if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
- llvm::any_of(rhsShape, ShapedType::isDynamic)) {
- return success();
- }
-
- // Get a vector of best tile size ordered from best to worst.
- Type elementType =
- op.inputs()[0].getType().cast<ShapedType>().getElementType();
- int64_t matmulScale = lhsShape[0] * lhsShape[1] * rhsShape[2];
- SmallVector<TileWorkgroupSizePair, 4> pairs;
- getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
-
- for (TileWorkgroupSizePair pair : pairs) {
- if (lhsShape[1] % pair.tileSize[0] != 0 ||
- rhsShape[2] % pair.tileSize[1] != 0 ||
- lhsShape[2] % pair.tileSize[2] != 0) {
- continue;
- }
-
- auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
-
- SmallVector<int64_t, 4> numElementsPerWorkgroup;
- numElementsPerWorkgroup = {1, pair.tileSize[0], pair.tileSize[1],
- pair.tileSize[2]};
-
- TileSizesListType tileSizes;
- // Workgroup level.
- tileSizes.push_back(numElementsPerWorkgroup);
- // No tiling at the subgroup level since this target doesn't use subgroup op
- // or shared memory.
- tileSizes.emplace_back();
- // Invocation level.
- tileSizes.push_back({numElementsPerWorkgroup[0],
- numElementsPerWorkgroup[1] / pair.workgroupSize[1],
- numElementsPerWorkgroup[2] / pair.workgroupSize[0],
- numElementsPerWorkgroup[3]});
-
- return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
- op, tileSizes, {}, pipeline,
- pair.workgroupSize);
- }
- return success();
-}
-
-static LogicalResult setOpConfig(linalg::MatmulOp op) {
- ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
- ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
-
- if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
- llvm::any_of(rhsShape, ShapedType::isDynamic)) {
- return success();
- }
-
- Type elementType =
- op.inputs()[0].getType().cast<ShapedType>().getElementType();
- int64_t matmulScale = lhsShape[0] * rhsShape[1];
- SmallVector<TileWorkgroupSizePair, 4> pairs;
- getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
-
- for (TileWorkgroupSizePair pair : pairs) {
- if (lhsShape[0] % pair.tileSize[0] != 0 ||
- rhsShape[1] % pair.tileSize[1] != 0 ||
- lhsShape[1] % pair.tileSize[2] != 0) {
- continue;
- }
-
- auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
-
- SmallVector<int64_t, 4> numElementsPerWorkgroup(pair.tileSize.begin(),
- pair.tileSize.end());
-
- TileSizesListType tileSizes;
- // Workgroup level.
- tileSizes.push_back(numElementsPerWorkgroup);
- // No tiling at the subgroup level since this target doesn't use subgroup op
- // or shared memory.
- tileSizes.emplace_back();
- // Invocation level.
- tileSizes.push_back({numElementsPerWorkgroup[0] / pair.workgroupSize[1],
- numElementsPerWorkgroup[1] / pair.workgroupSize[0],
- numElementsPerWorkgroup[2]});
-
- return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
- op, tileSizes, {}, pipeline,
- pair.workgroupSize);
- }
- return success();
-}
-
//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
@@ -181,8 +27,17 @@
Operation *rootOp) {
int64_t subgroupSize = targetEnv.getResourceLimits().subgroup_size().getInt();
return TypeSwitch<Operation *, LogicalResult>(rootOp)
- .Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
- [](auto op) { return setOpConfig(op); })
+ .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([](auto op) {
+ std::array<int64_t, 2> workgroupXY = {8, 2};
+ std::array<int64_t, 3> threadMNK;
+ auto inputType = op.inputs()[0].getType().template cast<ShapedType>();
+ if (inputType.getElementType().isF16()) {
+ threadMNK = {8, 8, 4};
+ } else {
+ threadMNK = {6, 4, 4};
+ }
+ return setMatmulOpConfig(op, workgroupXY, threadMNK);
+ })
.Case<linalg::Conv2DNhwcHwcfOp>([subgroupSize](auto op) {
return setConvOpConfig(op, subgroupSize,
/*bestTilingFactor=*/16);
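
Mali now feeds the same shared routine, only with element-type-dependent hints: f16 gets deeper thread tiles ({8, 8, 4}) than f32 ({6, 4, 4}). Plugging the f32 hints into the hypothetical Python sketch from the KernelConfig.cpp section above reproduces the first expectation in the new config_mali_matmul.mlir test:

# Mali f32 hints from the Case<> above: workgroupXY = {8, 2}, threadMNK = {6, 4, 4}.
print(set_matmul_op_config((1024, 2048, 512), (8, 2), (6, 4, 4)))
# -> ([8, 2, 1], [8, 32, 4], [4, 4, 4]), i.e. workgroup_size [8, 2, 1] and
# tileSizes [[8, 32, 4], [], [4, 4, 4]], as the CHECK lines below expect.
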
diff --git a/iree/compiler/Codegen/SPIRV/test/BUILD b/iree/compiler/Codegen/SPIRV/test/BUILD
index 440ed22..5f847d4 100644
--- a/iree/compiler/Codegen/SPIRV/test/BUILD
+++ b/iree/compiler/Codegen/SPIRV/test/BUILD
@@ -24,6 +24,7 @@
"config_linalg_ext_ops.mlir",
"config_linalg_ops.mlir",
"config_mali_conv.mlir",
+ "config_mali_matmul.mlir",
"convert_to_spirv.mlir",
"distribute_to_global_id.mlir",
"fold_gpu_procid_uses.mlir",
diff --git a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
index 1495691..0d805b1 100644
--- a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
@@ -19,6 +19,7 @@
"config_linalg_ext_ops.mlir"
"config_linalg_ops.mlir"
"config_mali_conv.mlir"
+ "config_mali_matmul.mlir"
"convert_to_spirv.mlir"
"distribute_to_global_id.mlir"
"fold_gpu_procid_uses.mlir"
diff --git a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
index 9027f61..015e4d1 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
@@ -63,17 +63,17 @@
}
// CHECK-LABEL: hal.executable.entry_point public @matmul_1024x2048x512
-// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 16]}
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 32]}
// CHECK-SAME: workgroup_size = [32 : index, 2 : index, 1 : index]
// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%[[X]]]
-// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%[[Y]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[Y]]]
// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
// CHECK: func @matmul_1024x2048x512()
// CHECK: linalg.matmul
-// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[16, 128, 16], [], [8, 4, 16]]}
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[32, 128, 4], [], [16, 4, 4]]}
// -----
@@ -140,17 +140,17 @@
}
// CHECK-LABEL: hal.executable.entry_point public @matmul_3136x24x96
-// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 224]}
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 448]}
// CHECK-SAME: workgroup_size = [2 : index, 32 : index, 1 : index]
// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[X]]]
-// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 224)>()[%[[Y]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 448)>()[%[[Y]]]
// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
// CHECK: func @matmul_3136x24x96()
// CHECK: linalg.matmul
-// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[224, 8, 16], [], [7, 4, 16]]}
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[448, 8, 4], [], [14, 4, 4]]}
// -----
@@ -227,7 +227,7 @@
// CHECK: func @matmul_196x64x192()
// CHECK: linalg.matmul
-// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[28, 64, 16], [], [7, 4, 16]]}
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[28, 64, 8], [], [7, 4, 8]]}
// -----
@@ -289,17 +289,17 @@
}
// CHECK-LABEL: hal.executable.entry_point public @matmul_12544x96x16
-// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 64]}
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 128]}
// CHECK-SAME: workgroup_size = [8 : index, 8 : index, 1 : index]
// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
-// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%[[Y]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%[[Y]]]
// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
// CHECK: func @matmul_12544x96x16()
// CHECK: linalg.matmul
-// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[64, 32, 16], [], [8, 4, 16]]}
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[128, 32, 4], [], [16, 4, 4]]}
// -----
@@ -376,7 +376,7 @@
// CHECK: func @matmul_49x160x576()
// CHECK: linalg.matmul
-// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[7, 32, 16], [], [7, 4, 16]]}
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[7, 32, 8], [], [7, 4, 8]]}
// -----
@@ -454,16 +454,16 @@
}
// CHECK-LABEL: hal.executable.entry_point public @batch_matmul_4x384x384
-// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 16, 1]}
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [128, 32, 1]}
// CHECK-SAME: workgroup_size = [32 : index, 2 : index, 1 : index]
// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 128)>()[%[[X]]]
-// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 16)>()[%[[Y]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[Y]]]
// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[Z]]
// CHECK: func @batch_matmul_4x384x384()
// CHECK: linalg.batch_matmul
-// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[1, 16, 128, 16], [], [1, 8, 4, 16]]}
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[1, 32, 128, 4], [], [1, 16, 4, 4]]}
// -----
diff --git a/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
new file mode 100644
index 0000000..9a04e8f
--- /dev/null
+++ b/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
@@ -0,0 +1,553 @@
+// RUN: iree-opt -split-input-file -mlir-print-local-scope -pass-pipeline='hal.executable(hal.executable.variant(iree-spirv-lower-executable-target-pass{test-lowering-configuration=true}))' %s | IreeFileCheck %s
+
+// Large matmul that can match the best tiling scheme.
+
+hal.executable @matmul_1024x2048x512 {
+ hal.interface @io {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 16 : i32}>
+ }> {
+ hal.executable.entry_point @matmul_1024x2048x512 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @matmul_1024x2048x512() {
+ %c0 = constant 0 : index
+ %c2048 = constant 2048 : index
+ %c1024 = constant 1024 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:1024x512xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:512x2048xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1024x2048xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c1024 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c2048 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:1024x512xf32> -> tensor<?x512xf32>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2048)>(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x2048xf32> -> tensor<512x?xf32>
+ %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1024)>(%arg0)[%workgroup_size_y]
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2048)>(%arg1)[%workgroup_size_x]
+ %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 1024, s0)>(%arg0)[%workgroup_size_y]
+ %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 2048, s0)>(%arg1)[%workgroup_size_x]
+ %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+ %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x512xf32>, tensor<512x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1024x2048xf32>
+ }
+ }
+ return
+ }
+ hal.interface @io attributes {sym_visibility = "private"} {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @matmul_1024x2048x512
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8]}
+// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[Y]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+// CHECK: func @matmul_1024x2048x512()
+// CHECK: linalg.matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[8, 32, 4], [], [4, 4, 4]]}
+
+// -----
+
+// Small matmul N that can still tile to all threads in a workgroup.
+
+hal.executable @matmul_3136x24x96 {
+ hal.interface @io {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 16 : i32}>
+ }> {
+ hal.executable.entry_point @matmul_3136x24x96 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @matmul_3136x24x96() {
+ %c0 = constant 0 : index
+ %c24 = constant 24 : index
+ %c3136 = constant 3136 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:3136x96xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:96x24xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:3136x24xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c3136 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c24 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3136)>(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:3136x96xf32> -> tensor<?x96xf32>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 24)>(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [96, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:96x24xf32> -> tensor<96x?xf32>
+ %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3136)>(%arg0)[%workgroup_size_y]
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 24)>(%arg1)[%workgroup_size_x]
+ %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 3136, s0)>(%arg0)[%workgroup_size_y]
+ %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 24, s0)>(%arg1)[%workgroup_size_x]
+ %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+ %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x96xf32>, tensor<96x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:3136x24xf32>
+ }
+ }
+ return
+ }
+ hal.interface @io attributes {sym_visibility = "private"} {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @matmul_3136x24x96
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 32]}
+// CHECK-SAME: workgroup_size = [2 : index, 8 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[X]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[Y]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+// CHECK: func @matmul_3136x24x96()
+// CHECK: linalg.matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[32, 8, 4], [], [4, 4, 4]]}
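+//
+// With N = 24 too small for the default 32-wide tile, the config appears to
+// shrink the N tile to 8 and grow the M tile to 32 instead, keeping the same
+// 16 threads but reshaped to [8/4, 32/4, 1] = [2, 8, 1].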
+
+// -----
+
+// Matmul with a small M dimension that can still tile to all threads in a workgroup.
+
+hal.executable @matmul_196x64x192 {
+ hal.interface @io {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 16 : i32}>
+ }> {
+ hal.executable.entry_point @matmul_196x64x192 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @matmul_196x64x192() {
+ %c0 = constant 0 : index
+ %c64 = constant 64 : index
+ %c196 = constant 196 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:196x192xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:192x64xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:196x64xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c196 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c64 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 196)>(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:196x192xf32> -> tensor<?x192xf32>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [192, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:192x64xf32> -> tensor<192x?xf32>
+ %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 196)>(%arg0)[%workgroup_size_y]
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg1)[%workgroup_size_x]
+ %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 196, s0)>(%arg0)[%workgroup_size_y]
+ %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 64, s0)>(%arg1)[%workgroup_size_x]
+ %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+ %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x192xf32>, tensor<192x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:196x64xf32>
+ }
+ }
+ return
+ }
+ hal.interface @io attributes {sym_visibility = "private"} {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @matmul_196x64x192
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 4]}
+// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%[[Y]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+// CHECK: func @matmul_196x64x192()
+// CHECK: linalg.matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[4, 32, 8], [], [2, 4, 8]]}
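+//
+// M = 196 is not divisible by 8, so the config appears to fall back to a
+// 4-wide M tile with 2 rows per thread, letting the K tile grow to 8 to
+// compensate; the workgroup stays at [32/4, 4/2, 1] = [8, 2, 1] threads.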
+
+// -----
+
+// Matmul with a small K dimension that can still tile to all threads in a workgroup.
+
+hal.executable @matmul_12544x96x16 {
+ hal.interface @io {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 16 : i32}>
+ }> {
+ hal.executable.entry_point @matmul_12544x96x16 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @matmul_12544x96x16() {
+ %c0 = constant 0 : index
+ %c96 = constant 96 : index
+ %c12544 = constant 12544 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<12544x16xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<16x96xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<12544x96xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c12544 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c96 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 12544)>(%arg0)[%workgroup_size_y]
+ %8 = memref.subview %0[%arg0, 0] [%7, 16] [1, 1] : memref<12544x16xf32> to memref<?x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 96)>(%arg1)[%workgroup_size_x]
+ %10 = memref.subview %1[0, %arg1] [16, %9] [1, 1] : memref<16x96xf32> to memref<16x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>
+ %11 = memref.subview %2[%arg0, %arg1] [%7, %9] [1, 1] : memref<12544x96xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>
+ linalg.fill(%cst, %11) : f32, memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>
+ linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : memref<?x16xf32, affine_map<(d0, d1)[s0] -> (d0 * 16 + s0 + d1)>>, memref<16x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>) outs(%11 : memref<?x?xf32, affine_map<(d0, d1)[s0] -> (d0 * 96 + s0 + d1)>>)
+ }
+ }
+ return
+ }
+ hal.interface @io attributes {sym_visibility = "private"} {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @matmul_12544x96x16
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 8]}
+// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[Y]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[ONE]]
+
+// CHECK: func @matmul_12544x96x16()
+// CHECK: linalg.matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[8, 32, 4], [], [4, 4, 4]]}
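+//
+// Note this case runs on already-bufferized memref code rather than tensors,
+// yet lands on the same config as the large matmul above: K = 16 is small
+// but still divides evenly by the K tile of 4.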
+
+// -----
+
+// Matmul with an odd M dimension and a small N that cannot utilize all threads in a workgroup.
+
+hal.executable @matmul_49x160x576 {
+ hal.interface @io {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 16 : i32}>
+ }> {
+ hal.executable.entry_point @matmul_49x160x576 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @matmul_49x160x576() {
+ %c0 = constant 0 : index
+ %c160 = constant 160 : index
+ %c49 = constant 49 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:49x576xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:576x160xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:49x160xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c49 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c160 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 49)>(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:49x576xf32> -> tensor<?x576xf32>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 160)>(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [576, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:576x160xf32> -> tensor<576x?xf32>
+ %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 49)>(%arg0)[%workgroup_size_y]
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 160)>(%arg1)[%workgroup_size_x]
+ %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 49, s0)>(%arg0)[%workgroup_size_y]
+ %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 160, s0)>(%arg1)[%workgroup_size_x]
+ %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
+ %16 = linalg.fill(%cst, %15) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x576xf32>, tensor<576x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:49x160xf32>
+ }
+ }
+ return
+ }
+ hal.interface @io attributes {sym_visibility = "private"} {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @matmul_49x160x576
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 1]}
+// CHECK-SAME: workgroup_size = [8 : index, 1 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y]], %[[ONE]]
+
+// CHECK: func @matmul_49x160x576()
+// CHECK: linalg.matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[1, 32, 8], [], [1, 4, 8]]}
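+//
+// With M = 49 the M tile degenerates to 1, so only 8 of the 16 subgroup
+// lanes are used ([8, 1, 1]); the Y workload passes through as the workgroup
+// count because ceildiv by 1 is the identity.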
+
+// -----
+
+// Large batch matmul.
+
+hal.executable @batch_matmul_4x384x384 {
+ hal.interface @io {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 16 : i32}>
+ }> {
+ hal.executable.entry_point @batch_matmul_4x384x384 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @batch_matmul_4x384x384() {
+ %c0 = constant 0 : index
+ %c384 = constant 384 : index
+ %c4 = constant 4 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x384x32xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x32x384xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:4x384x384xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_size_z = hal.interface.workgroup.size[2] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
+ scf.for %arg0 = %3 to %c4 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg1 = %5 to %c384 step %6 {
+ %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg2 = %7 to %c384 step %8 {
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+ %10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg1)[%workgroup_size_y]
+ %11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%9, %10, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x384x32xf32> -> tensor<?x?x32xf32>
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+ %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg2)[%workgroup_size_x]
+ %14 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%12, 32, %13], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x384xf32> -> tensor<?x32x?xf32>
+ %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+ %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg1)[%workgroup_size_y]
+ %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg2)[%workgroup_size_x]
+ %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 4, s0)>(%arg0)[%workgroup_size_z]
+ %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg1)[%workgroup_size_y]
+ %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg2)[%workgroup_size_x]
+ %21 = linalg.init_tensor [%18, %19, %20] : tensor<?x?x?xf32>
+ %22 = linalg.fill(%cst, %21) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %14 : tensor<?x?x32xf32>, tensor<?x32x?xf32>) outs(%22 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ flow.dispatch.tensor.store %23, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%15, %16, %17], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x384x384xf32>
+ }
+ }
+ }
+ return
+ }
+ hal.interface @io attributes {sym_visibility = "private"} {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @batch_matmul_4x384x384
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [32, 12, 1]}
+// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 32)>()[%[[X]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 12)>()[%[[Y]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[Z]]
+
+// CHECK: func @batch_matmul_4x384x384()
+// CHECK: linalg.batch_matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[1, 12, 32, 4], [], [1, 6, 4, 4]]}
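+//
+// For batch matmul the batch dimension is tiled by 1, so Z passes straight
+// through as the workgroup count while X and Y get the usual ceildiv
+// treatment (384/32 = 12 and 384/12 = 32 groups per batch element).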
+
+// -----
+
+// Small batch matmul.
+
+hal.executable @batch_matmul_4x2x8 {
+ hal.interface @io {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 16 : i32}>
+ }> {
+ hal.executable.entry_point @batch_matmul_4x2x8 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @batch_matmul_4x2x8() {
+ %c0 = constant 0 : index
+ %c8 = constant 8 : index
+ %c2 = constant 2 : index
+ %c4 = constant 4 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x2x32xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x32x8xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:4x2x8xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_size_z = hal.interface.workgroup.size[2] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
+ scf.for %arg0 = %3 to %c4 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg1 = %5 to %c2 step %6 {
+ %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg2 = %7 to %c8 step %8 {
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+ %10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2)>(%arg1)[%workgroup_size_y]
+ %11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%9, %10, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x2x32xf32> -> tensor<?x?x32xf32>
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+ %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 8)>(%arg2)[%workgroup_size_x]
+ %14 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%12, 32, %13], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x8xf32> -> tensor<?x32x?xf32>
+ %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_z]
+ %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 2)>(%arg1)[%workgroup_size_y]
+ %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 8)>(%arg2)[%workgroup_size_x]
+ %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 4, s0)>(%arg0)[%workgroup_size_z]
+ %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 2, s0)>(%arg1)[%workgroup_size_y]
+ %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 8, s0)>(%arg2)[%workgroup_size_x]
+ %21 = linalg.init_tensor [%18, %19, %20] : tensor<?x?x?xf32>
+ %22 = linalg.fill(%cst, %21) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %14 : tensor<?x?x32xf32>, tensor<?x32x?xf32>) outs(%22 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ flow.dispatch.tensor.store %23, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%15, %16, %17], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x2x8xf32>
+ }
+ }
+ }
+ return
+ }
+ hal.interface @io attributes {sym_visibility = "private"} {
+ hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @batch_matmul_4x2x8
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVVectorize", workloadPerWorkgroup = [8, 2, 1]}
+// CHECK-SAME: workgroup_size = [2 : index, 2 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%[[X]]]
+// CHECK-NEXT: %[[Y_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 2)>()[%[[Y]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y_COUNT]], %[[Z]]
+
+// CHECK: func @batch_matmul_4x2x8()
+// CHECK: linalg.batch_matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[1, 2, 8, 8], [], [1, 1, 4, 8]]}
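+//
+// A tiny batch matmul: the workgroup tile clamps to the full 2x8 output per
+// batch element, leaving a [2, 2, 1] workgroup in which each thread computes
+// a 1x4 output strip with the K loop tiled by 8.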
diff --git a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
index 49b983d..3afdcba 100644
--- a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
@@ -207,7 +207,7 @@
SmallVector<mlir::ModuleOp, 8> innerModuleOps;
innerModuleOps.reserve(sourceExecutableOpGroups.size());
- for (auto hashExecutablePair : sourceExecutableOpGroups) {
+ for (const auto &hashExecutablePair : sourceExecutableOpGroups) {
llvm::hash_code hash = hashExecutablePair.first;
const auto &sourceExecutableOps = hashExecutablePair.second;
diff --git a/iree/vm/bytecode_dispatch_util.h b/iree/vm/bytecode_dispatch_util.h
index bce7fe8..5e17867 100644
--- a/iree/vm/bytecode_dispatch_util.h
+++ b/iree/vm/bytecode_dispatch_util.h
@@ -226,9 +226,9 @@
#define VM_DecOperandRegF64(name) \
*((double*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
pc += kRegSize;
-#define VM_DecOperandRegRef(name, out_is_move) \
- &regs.ref[OP_I16(0) & regs.ref_mask]; \
- *(out_is_move) = OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT; \
+#define VM_DecOperandRegRef(name, out_is_move) \
+ &regs.ref[OP_I16(0) & regs.ref_mask]; \
+ *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
pc += kRegSize;
#define VM_DecVariadicOperands(name) \
VM_DecVariadicOperandsImpl(bytecode_data, &pc)
@@ -252,9 +252,9 @@
#define VM_DecResultRegF64(name) \
((double*)&regs.i32[OP_I16(0) & (regs.i32_mask & ~1)]); \
pc += kRegSize;
-#define VM_DecResultRegRef(name, out_is_move) \
- &regs.ref[OP_I16(0) & regs.ref_mask]; \
- *(out_is_move) = OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT; \
+#define VM_DecResultRegRef(name, out_is_move) \
+ &regs.ref[OP_I16(0) & regs.ref_mask]; \
+ *(out_is_move) = 0; /*= OP_I16(0) & IREE_REF_REGISTER_MOVE_BIT;*/ \
pc += kRegSize;
#define VM_DecVariadicResults(name) VM_DecVariadicOperands(name)
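+// NOTE: with out_is_move forced to 0 in the macros above, ref operands and
+// results are always treated as retained (copied) rather than moved; the
+// original expression is preserved in the trailing comment, presumably so
+// move semantics can be re-enabled once the underlying issue is resolved.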
diff --git a/llvm-external-projects/iree-compiler-api/CMakeLists.txt b/llvm-external-projects/iree-compiler-api/CMakeLists.txt
index 4313f0b..76781b9 100644
--- a/llvm-external-projects/iree-compiler-api/CMakeLists.txt
+++ b/llvm-external-projects/iree-compiler-api/CMakeLists.txt
@@ -23,7 +23,7 @@
# a CMake min version of 3.0, which causes them to set it locally to OLD.
set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
-project(iree-compiler-backend LANGUAGES C CXX)
+project(iree-compiler-api LANGUAGES C CXX)
# Directory layout.
# When building in-tree, this directory exists relative to the overall
@@ -80,6 +80,10 @@
# MLIR_ENABLE_BINDINGS_PYTHON option.
set(MHLO_ENABLE_BINDINGS_PYTHON ON CACHE BOOL "" FORCE)
+# Required IREE settings.
+set(IREE_BUILD_PYTHON_BINDINGS ON CACHE BOOL "" FORCE)
+set(IREE_BUILD_OLD_PYTHON_COMPILER_API OFF CACHE BOOL "" FORCE)
+
# TODO: Fix this upstream. Each of these system include hacks is broken in
# a different way, so there is not an easy local fix. They should be removed
# one by one until this project builds. Since this is the first time all of this
diff --git a/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh b/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh
index a7a950f..362dde9 100755
--- a/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh
+++ b/llvm-external-projects/iree-compiler-api/build_tools/build_intree.sh
@@ -11,12 +11,19 @@
set -eu -o errtrace
project_dir="$(cd $(dirname $0)/.. && pwd)"
+workspace_dir="$project_dir/../.."
build_dir="$project_dir/build"
+# Write out a .env file to the workspace.
+echo "PYTHONPATH=$build_dir/python_package:$build_dir/iree/bindings/python" > $workspace_dir/.env
+
cmake -GNinja -B"$build_dir" "$project_dir" \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld \
+ -DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld \
+ -DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld \
+ -DLLVM_OPTIMIZED_TABLEGEN=ON \
"$@"
cd $build_dir
-ninja
+ninja all iree/bindings/python/all
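+
+# The .env file written above makes the freshly built Python packages visible
+# to tools that read it (e.g. VS Code's Python extension); the equivalent
+# manual setup would be something like:
+#   export PYTHONPATH="$build_dir/python_package:$build_dir/iree/bindings/python"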
diff --git a/llvm-external-projects/iree-dialects/BUILD b/llvm-external-projects/iree-dialects/BUILD
index eb4db6e..f476d4f 100644
--- a/llvm-external-projects/iree-dialects/BUILD
+++ b/llvm-external-projects/iree-dialects/BUILD
@@ -13,6 +13,7 @@
srcs = glob([
"include/iree-dialects/Dialect/IREE/*.td",
"include/iree-dialects/Dialect/IREEPyDM/IR/*.td",
+ "include/iree-dialects/Dialect/IREEPyDM/Transforms/*.td",
]),
)
@@ -144,20 +145,50 @@
],
)
+gentbl_cc_library(
+ name = "IREEPyDMTransformsIncGen",
+ strip_include_prefix = "include",
+ tbl_outs = [
+ (
+ ["-gen-pass-decls"],
+ "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.h.inc",
+ ),
+ (
+ ["-gen-pass-capi-header"],
+ "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.capi.h.inc",
+ ),
+ (
+ ["-gen-pass-capi-impl"],
+ "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.capi.cpp.inc",
+ ),
+ ],
+ tblgen = "@llvm-project//mlir:mlir-tblgen",
+ td_file = "include/iree-dialects/Dialect/IREEPyDM/Transforms/Passes.td",
+ deps = [
+ ":TdFiles",
+ "@llvm-project//mlir:PassBaseTdFiles",
+ ],
+)
+
cc_library(
name = "IREEPyDMDialect",
srcs = glob([
"lib/Dialect/IREEPyDM/IR/*.cpp",
]),
- hdrs = glob(["include/iree-dialects/Dialect/IREEPyDM/IR/*.h"]),
+ hdrs = glob([
+ "include/iree-dialects/Dialect/IREEPyDM/IR/*.h",
+ "include/iree-dialects/Dialect/IREEPyDM/Transforms/*.h",
+ ]),
includes = ["include"],
deps = [
":IREEPyDMInterfacesIncGen",
":IREEPyDMOpsIncGen",
+ ":IREEPyDMTransformsIncGen",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:CallOpInterfaces",
"@llvm-project//mlir:ControlFlowInterfaces",
"@llvm-project//mlir:IR",
+ "@llvm-project//mlir:Pass",
"@llvm-project//mlir:Support",
],
)
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h b/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h
index 27a1ba2..613df95 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects-c/Dialects.h
@@ -8,6 +8,7 @@
#define IREE_LLVM_EXTERNAL_PROJECTS_IREE_DIALECTS_C_DIALECTS_H
#include "mlir-c/IR.h"
+#include "mlir-c/Pass.h"
#include "mlir-c/Registration.h"
#ifdef __cplusplus
@@ -50,6 +51,10 @@
MLIR_CAPI_EXPORTED MlirType mlirIREEPyDMObjectTypeGet(MlirContext context,
MlirType primitive);
+/// Builds a pass pipeline which lowers the iree_pydm dialect to IREE.
+MLIR_CAPI_EXPORTED void mlirIREEPyDMBuildLowerToIREEPassPipeline(
+ MlirOpPassManager passManager);
+
#ifdef __cplusplus
}
#endif
diff --git a/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt b/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt
index fc3317e..66682e5 100644
--- a/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt
+++ b/llvm-external-projects/iree-dialects/lib/CAPI/CMakeLists.txt
@@ -5,6 +5,7 @@
MLIRIR
IREEDialectsIREEDialect
IREEDialectsIREEPyDMDialect
+ IREEDialectsIREEPyDMToIREEPasses
)
iree_dialects_target_includes(IREEDialectsCAPI)
diff --git a/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp b/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp
index 56b053a..ad5e546 100644
--- a/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp
+++ b/llvm-external-projects/iree-dialects/lib/CAPI/Dialects.cpp
@@ -8,7 +8,13 @@
#include "iree-dialects/Dialect/IREE/IREEDialect.h"
#include "iree-dialects/Dialect/IREEPyDM/IR/Dialect.h"
+#include "iree-dialects/Dialect/IREEPyDM/Transforms/Passes.h"
+#include "mlir/CAPI/IR.h"
+#include "mlir/CAPI/Pass.h"
#include "mlir/CAPI/Registration.h"
+#include "mlir/CAPI/Support.h"
+#include "mlir/CAPI/Utils.h"
+#include "mlir/CAPI/Wrap.h"
//===----------------------------------------------------------------------===//
// IREEDialect
@@ -59,3 +65,9 @@
auto cppType = unwrap(primitive).cast<mlir::iree_pydm::PrimitiveType>();
return wrap(mlir::iree_pydm::ObjectType::get(unwrap(ctx), cppType));
}
+
+void mlirIREEPyDMBuildLowerToIREEPassPipeline(MlirOpPassManager passManager) {
+ auto *passManagerCpp = unwrap(passManager);
+ // TODO: Should be a pass pipeline, not loose passes in the C impl.
+ passManagerCpp->addPass(mlir::iree_pydm::createConvertIREEPyDMToIREEPass());
+}
diff --git a/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp b/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp
index 00d280a..0b235ee 100644
--- a/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp
+++ b/llvm-external-projects/iree-dialects/python/IREEDialectsModule.cpp
@@ -74,6 +74,15 @@
},
py::arg("context") = py::none(), py::arg("load") = true);
+ iree_pydm_m.def(
+ "build_lower_to_iree_pass_pipeline",
+ [](MlirPassManager passManager) {
+ MlirOpPassManager opPassManager =
+ mlirPassManagerGetAsOpPassManager(passManager);
+ mlirIREEPyDMBuildLowerToIREEPassPipeline(opPassManager);
+ },
+ py::arg("pass_manager"));
+
#define DEFINE_IREEPYDM_NULLARY_TYPE(Name) \
mlir_type_subclass(iree_pydm_m, #Name "Type", mlirTypeIsAIREEPyDM##Name, \
typeClass) \