Merge google -> main #7136
2e8546d Synchronize submodules with LLVM at llvm/llvm-project@f5b8f12
29317b6 Merge pull request #7131 from rsuderman:main-to-google (Merge main -> google)
a317e2c Move quantized type stripping to a separate pass.
87e6ba5 Migrate ScatterDimensionNumbers attribute definition from StructAttr to be a..
1d77498 Synchronize submodules with LLVM at llvm/llvm-project@f5b8f12
diff --git a/iree/compiler/Codegen/LLVMCPU/BUILD b/iree/compiler/Codegen/LLVMCPU/BUILD
index a71024b..f9e7750 100644
--- a/iree/compiler/Codegen/LLVMCPU/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/BUILD
@@ -16,7 +16,6 @@
"ConvertToLLVM.cpp",
"KernelDispatch.cpp",
"LLVMCPULowerExecutableTarget.cpp",
- "LLVMCPUPadWorkgroupTiles.cpp",
"LLVMCPUPlanConvLoopOrder.cpp",
"LLVMCPUSynchronizeSymbolVisibility.cpp",
"LLVMCPUTileAndVectorizeLinalgTensorOps.cpp",
diff --git a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index fb5643a..72597b1 100644
--- a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -19,7 +19,6 @@
"ConvertToLLVM.cpp"
"KernelDispatch.cpp"
"LLVMCPULowerExecutableTarget.cpp"
- "LLVMCPUPadWorkgroupTiles.cpp"
"LLVMCPUPlanConvLoopOrder.cpp"
"LLVMCPUSynchronizeSymbolVisibility.cpp"
"LLVMCPUTileAndVectorizeLinalgTensorOps.cpp"
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUPadWorkgroupTiles.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUPadWorkgroupTiles.cpp
deleted file mode 100644
index 3f11b5e..0000000
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUPadWorkgroupTiles.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright 2021 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
-#include "iree/compiler/Codegen/PassDetail.h"
-#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
-#include "llvm/Support/Debug.h"
-#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
-#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#define DEBUG_TYPE "iree-llvmcpu-pad-workgroup-tiles"
-
-namespace mlir {
-namespace iree_compiler {
-
-namespace {
-
-// Creates linalg.pad_tensor op with constant padding.
-static linalg::PadTensorOp createPadTensorOpWithStaticPadding(
- PatternRewriter &rewriter, mlir::Location loc, Type resultType, Value input,
- Value padding, ArrayRef<int64_t> lowPadding,
- ArrayRef<int64_t> highPadding) {
- auto padTensorOp = rewriter.create<linalg::PadTensorOp>(
- loc, resultType, input, ArrayRef<Value>{}, ArrayRef<Value>{},
- rewriter.getI64ArrayAttr(lowPadding),
- rewriter.getI64ArrayAttr(highPadding));
-
- int rank = padTensorOp.getResultType().getRank();
- SmallVector<Type, 4> blockArgTypes;
- blockArgTypes.assign(rank, rewriter.getIndexType());
-  auto &region = padTensorOp.region();
- OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.createBlock(&region, region.end(), blockArgTypes);
- rewriter.create<linalg::YieldOp>(loc, padding);
- return padTensorOp;
-}
-
-// Returns padding for dim to next integer multiple of the vector size.
-static std::pair<int, int> getVectorPaddingSize(int dim, int vecSize) {
- if (dim < vecSize) {
- int size = vecSize;
- int padValue = size - dim;
- return {vecSize, padValue};
- } else {
- int size = ceil((float)dim / (float)vecSize) * vecSize;
- int padValue = size - dim;
- return {size, padValue};
- }
-}
-
-// Returns padding for dim to next integer multiple of the workgroup size.
-// Note: KernelDispatch guarantees workgroup size is the largest integer
-// multiple of the vector size.
-std::pair<int, int> getWorkgroupPaddedTileSize(int dim, int workgoupSize,
- int vecSize) {
- if (dim > workgoupSize) {
- int size = ceil((float)dim / (float)workgoupSize) * workgoupSize;
- int padValue = size - dim;
- return {workgoupSize, padValue};
- } else {
- return getVectorPaddingSize(dim, vecSize);
- }
-}
-
-// Creates linalg.matmul with operands padded to the next integer multiple of
-// the workgroup size.
-class MatmulWorkgroupTilesPadding : public OpRewritePattern<linalg::MatmulOp> {
- public:
- using OpRewritePattern<linalg::MatmulOp>::OpRewritePattern;
-
- LogicalResult matchAndRewrite(linalg::MatmulOp matmulOp,
- PatternRewriter &rewriter) const override {
- if (!getLoweringConfig(matmulOp)) return failure();
- auto loc = matmulOp.getLoc();
- auto lhs = matmulOp.getInputOperand(0)->get();
- auto rhs = matmulOp.getInputOperand(1)->get();
- auto result = matmulOp.getOutputOperand(0)->get();
-
- if (lhs.getDefiningOp<linalg::PadTensorOp>() ||
- rhs.getDefiningOp<linalg::PadTensorOp>())
- return failure();
-
- auto workgroupTileSizes = getTileSizes(
- matmulOp, static_cast<unsigned>(TilingLevel::WorkGroupTiles));
- auto vectorTileSizes =
- getTileSizes(matmulOp, static_cast<unsigned>(TilingLevel::Level2Tiles));
- if (workgroupTileSizes.empty() || vectorTileSizes.empty()) return failure();
-
- auto lhsFullSize = getUntiledShape(lhs);
- auto rhsFullSize = getUntiledShape(rhs);
- if (lhsFullSize.empty() || rhsFullSize.empty()) return failure();
-
- int problemSizeM = lhsFullSize[0];
- int problemSizeN = rhsFullSize[1];
- int problemSizeK = lhsFullSize[1];
-
- int paddedMSize, paddedNSize, paddedKSize;
- int paddingForM, paddingForN, paddingForK;
- std::tie(paddedMSize, paddingForM) = getWorkgroupPaddedTileSize(
- problemSizeM, workgroupTileSizes[0], vectorTileSizes[0]);
- std::tie(paddedNSize, paddingForN) = getWorkgroupPaddedTileSize(
- problemSizeN, workgroupTileSizes[1], vectorTileSizes[1]);
- std::tie(paddedKSize, paddingForK) =
- getVectorPaddingSize(problemSizeK, vectorTileSizes[2]);
-
- // No padding.
- if (paddingForM == 0 && paddingForN == 0 && paddingForK == 0)
- return failure();
-
- DEBUG_WITH_TYPE(DEBUG_TYPE, {
- auto l1TileSizes = getTileSizes(
- matmulOp, static_cast<unsigned>(TilingLevel::Level1Tiles));
- llvm::dbgs() << "Problem-size: "
- << "[" << problemSizeM << "," << problemSizeK << "]"
- << ", "
- << "[" << problemSizeK << "," << problemSizeN << "]\n";
- llvm::dbgs() << "Workgroup-tile-sizes:"
- << "[" << workgroupTileSizes[0] << ", "
- << workgroupTileSizes[1] << "]\n";
- llvm::dbgs() << "L1-tile-sizes:"
- << "[" << l1TileSizes[0] << ", " << l1TileSizes[1] << ","
- << l1TileSizes[2] << "]\n";
- llvm::dbgs() << "Vector-tile-sizes:"
- << "[" << vectorTileSizes[0] << ", " << vectorTileSizes[1]
- << ", " << vectorTileSizes[2] << "]\n";
- auto lhsStackSize = paddedMSize * paddedKSize * 4;
- auto rhsStackSize = paddedKSize * paddedNSize * 4;
- auto outputStackSize = paddedMSize * paddedNSize * 4;
- llvm::dbgs() << "LHS after padding:"
- << "[" << paddedMSize << "," << paddedKSize
- << "], size_in_bytes = " << lhsStackSize << "\n";
- llvm::dbgs() << "RHS after padding:"
- << "[" << paddedKSize << "," << paddedNSize
- << "], size_in_bytes = " << rhsStackSize << "\n";
- llvm::dbgs() << "Result after padding:"
- << "[" << paddedMSize << "," << paddedNSize
- << "], size_in_bytes = " << outputStackSize << "\n";
- });
-
- auto getPaddedOperand = [&](Value operand, ArrayRef<int64_t> shape,
- ArrayRef<int64_t> highPadding) -> Value {
- if (llvm::all_of(highPadding,
- [](int64_t val) -> bool { return val == 0; })) {
- return operand;
- }
- auto elementType =
- operand.getType().cast<RankedTensorType>().getElementType();
- auto paddedType = RankedTensorType::get(shape, elementType);
- auto paddingValue =
- rewriter.create<ConstantOp>(loc, rewriter.getZeroAttr(elementType));
- auto paddedOperand =
- createPadTensorOpWithStaticPadding(rewriter, loc, paddedType, operand,
- paddingValue, {0, 0}, highPadding);
- return paddedOperand;
- };
-
- auto paddedLhs = getPaddedOperand(lhs, {paddedMSize, paddedKSize},
- {paddingForM, paddingForK});
-
- auto paddedrhs = getPaddedOperand(rhs, {paddedKSize, paddedNSize},
- {paddingForK, paddingForN});
-
- auto resultType = RankedTensorType::get(
- {paddedMSize, paddedNSize},
- result.getType().cast<RankedTensorType>().getElementType());
-
- // Padding for K-dim only result doesn't change result size.
- if (paddingForM == 0 && paddingForN == 0) {
- auto paddedMatmulOp =
- cast<linalg::LinalgOp>(matmulOp.getOperation())
- .clone(rewriter, loc, {resultType},
- ArrayRef<Value>{paddedLhs, paddedrhs, result});
- rewriter.replaceOp(matmulOp, paddedMatmulOp->getResults());
- } else {
-      // Padding either M or N requires changing the result size.
- auto getActualSizes = [](Value operand) {
- auto defOp = operand.getDefiningOp<IREE::Flow::DispatchTensorLoadOp>();
- return defOp.sizes();
- };
- // Get the actual output tile size (before padding).
- auto lhsSizes = getActualSizes(lhs);
- auto rhsSizes = getActualSizes(rhs);
- SmallVector<OpFoldResult> sizes;
- if (lhsSizes.empty()) {
- sizes.push_back(rewriter.getIndexAttr(paddedMSize));
- } else {
- sizes.push_back(lhsSizes.front());
- }
- if (rhsSizes.empty()) {
- sizes.push_back(rewriter.getIndexAttr(paddedNSize));
- } else {
- sizes.push_back(rhsSizes.back());
- }
- auto elementType = matmulOp.getResults()[0]
- .getType()
- .cast<ShapedType>()
- .getElementType();
- auto staticResult = rewriter.create<linalg::InitTensorOp>(
- loc, ArrayRef<int64_t>{paddedMSize, paddedNSize}, elementType);
-
- Value zero =
- rewriter.create<ConstantOp>(loc, rewriter.getZeroAttr(elementType));
- auto filledStaticResult =
- rewriter.create<linalg::FillOp>(loc, zero, staticResult);
- auto paddedMatmulOp =
- cast<linalg::LinalgOp>(matmulOp.getOperation())
- .clone(rewriter, loc, {resultType},
- ArrayRef<Value>{paddedLhs, paddedrhs,
- filledStaticResult.result()});
- SmallVector<OpFoldResult> offsets(2, rewriter.getI64IntegerAttr(0));
- SmallVector<OpFoldResult> strides(2, rewriter.getI64IntegerAttr(1));
- rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
- matmulOp, paddedMatmulOp->getResults()[0], offsets, sizes, strides);
- }
- return success();
- }
-};
-
-struct LLVMCPUPadWorkgroupTilesPass
- : LLVMCPUPadWorkgroupTilesBase<LLVMCPUPadWorkgroupTilesPass> {
-  void getDependentDialects(DialectRegistry &registry) const override {
- registry.insert<linalg::LinalgDialect>();
- }
- void runOnOperation() override {
- MLIRContext *context = &getContext();
- OwningRewritePatternList patterns(&getContext());
- patterns.insert<MatmulWorkgroupTilesPadding>(context);
- (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
- }
-};
-} // namespace
-
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUPadWorkgroupTilesPass() {
- return std::make_unique<LLVMCPUPadWorkgroupTilesPass>();
-}
-
-} // namespace iree_compiler
-} // namespace mlir
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index a5631f2..0327a2f 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -24,7 +24,6 @@
"hal_interface_workgroup_info.mlir",
"materialize_launch_configuration.mlir",
"matmul_vectorization.mlir",
- "pad_workgroup_tiles.mlir",
"plan_conv_loop_order.mlir",
"synchronize_symbol_visibility.mlir",
"tile_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 3c94a39..759e6d4 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -19,7 +19,6 @@
"hal_interface_workgroup_info.mlir"
"materialize_launch_configuration.mlir"
"matmul_vectorization.mlir"
- "pad_workgroup_tiles.mlir"
"plan_conv_loop_order.mlir"
"synchronize_symbol_visibility.mlir"
"tile_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/pad_workgroup_tiles.mlir b/iree/compiler/Codegen/LLVMCPU/test/pad_workgroup_tiles.mlir
deleted file mode 100644
index e601390..0000000
--- a/iree/compiler/Codegen/LLVMCPU/test/pad_workgroup_tiles.mlir
+++ /dev/null
@@ -1,66 +0,0 @@
-// RUN: iree-opt %s -cse -iree-llvmcpu-pad-workgroup-tiles -split-input-file | IreeFileCheck %s
-
-#config0 = {tileSizes = [[64, 64]]}
-#config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}
-module {
- func @matmul_f32_5x3x5() {
- %c0 = constant 0 : index
- %cst = constant 0.000000e+00 : f32
- %c5 = constant 5 : index
- %c64 = constant 64 : index
- %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:5x3xf32>
- %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x5xf32>
- %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:5x5xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c64]
- %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c64]
- scf.for %arg0 = %3 to %c5 step %4 {
- %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c64]
- %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c64]
- scf.for %arg1 = %5 to %c5 step %6 {
- %7 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg0)
- %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x3xf32> -> tensor<?x3xf32>
- %9 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg1)
- %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [3, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:3x5xf32> -> tensor<3x?xf32>
- %11 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg0)
- %12 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg1)
- %13 = affine.min affine_map<(d0) -> (-d0 + 5, 64)>(%arg0)
- %14 = affine.min affine_map<(d0) -> (-d0 + 5, 64)>(%arg1)
- %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
- %16 = linalg.fill(%cst, %15) {__internal_linalg_transform__ = "workgroup", lowering.config = #config0} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = #config1} ins(%8, %10 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32>
- }
- }
- return
- }
-
- hal.interface private @io {
- hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
- hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
- hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
- }
-}
-// CHECK-LABEL: @matmul_f32_5x3x5
-// CHECK: %[[C0:.+]] = constant 0.000000e+00 : f32
-// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.*}}] : !flow.dispatch.tensor<readonly:5x3xf32>
-// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%{{.*}}] : !flow.dispatch.tensor<readonly:3x5xf32>
-// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%{{.*}}] : !flow.dispatch.tensor<writeonly:5x5xf32>
-// CHECK: flow.dispatch.tensor.load %[[LHS]], offsets = [%{{.*}}, 0], sizes = [%[[LHS_TILE_SIZE:.+]], 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x3xf32> -> tensor<?x3xf32>
-// CHECK: flow.dispatch.tensor.load %[[RHS]], offsets = [0, %{{.*}}], sizes = [3, %[[RHS_TILE_SIZE:.+]]], strides = [1, 1] : !flow.dispatch.tensor<readonly:3x5xf32> -> tensor<3x?xf32>
-// CHECK: %[[PADDED_LHS:.+]] = linalg.pad_tensor %{{.*}} low[0, 0] high[3, 1] {
-// CHECK-NEXT: ^bb0(%{{.*}}: index, %{{.*}}: index): // no predecessors
-// CHECK-NEXT: linalg.yield %[[C0]] : f32
-// CHECK-NEXT: tensor<?x3xf32> to tensor<8x4xf32>
-// CHECK: %[[PADDED_RHS:.+]] = linalg.pad_tensor %{{.*}} low[0, 0] high[1, 3] {
-// CHECK-NEXT: ^bb0(%{{.*}}: index, %{{.*}}: index): // no predecessors
-// CHECK-NEXT: linalg.yield %[[C0]] : f32
-// CHECK-NEXT: tensor<3x?xf32> to tensor<4x8xf32>
-// CHECK: %[[PADDED_RESULT:.+]] = linalg.init_tensor [8, 8] : tensor<8x8xf32>
-// CHECK: %[[PADDED_RESULT_0:.+]] = linalg.fill(%[[C0]], %[[PADDED_RESULT]]) : f32, tensor<8x8xf32>
-// CHECK: %[[MATMUL_RESULT:.+]] = linalg.matmul {{.*}} ins(%[[PADDED_LHS]], %[[PADDED_RHS]] : tensor<8x4xf32>, tensor<4x8xf32>) outs(%[[PADDED_RESULT_0]] : tensor<8x8xf32>) -> tensor<8x8xf32>
-// CHECK: %[[CLIPED_RESULT:.+]] = tensor.extract_slice %[[MATMUL_RESULT]][0, 0] [%[[LHS_TILE_SIZE]], %[[RHS_TILE_SIZE]]] [1, 1] : tensor<8x8xf32> to tensor<?x?xf32>
-// CHECK: flow.dispatch.tensor.store %[[CLIPED_RESULT]], %[[RESULT]], offsets = [%{{.*}}, %{{.*}}], sizes = [%[[LHS_TILE_SIZE]], %[[RHS_TILE_SIZE]]], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32>
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index 5417c3a..8c86c1d 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -116,10 +116,6 @@
std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
createLLVMCPULowerExecutableTargetPass(bool lowerToVectors = true);
-/// Pad linalg ops workgroup tiles into the next integer multiple of the target
-/// vector size.
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUPadWorkgroupTilesPass();
-
/// Converts linalg.conv into linalg.generic with a CPU-friendly iteration
/// order.
std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUPlanConvLoopOrderPass();
diff --git a/iree/compiler/Codegen/Passes.td b/iree/compiler/Codegen/Passes.td
index 6233052..c74d2a4 100644
--- a/iree/compiler/Codegen/Passes.td
+++ b/iree/compiler/Codegen/Passes.td
@@ -103,13 +103,6 @@
"mlir::iree_compiler::createLLVMCPULowerExecutableTargetPass()";
}
-def LLVMCPUPadWorkgroupTiles :
- Pass<"iree-llvmcpu-pad-workgroup-tiles", "FuncOp"> {
- let summary =
- "Pad workgroup tiles to an integer multiple of tiling parameters.";
- let constructor = "mlir::iree_compiler::createLLVMCPUPadWorkgroupTilesPass()";
-}
-
def LLVMCPUPlanConvLoopOrder :
Pass<"iree-llvmcpu-plan-conv-loop-order", "FuncOp"> {
let summary =
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index fa57109..cf8c39d 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -208,6 +208,10 @@
LogicalResult setMatmulOpConfig(linalg::LinalgOp op,
std::array<int64_t, 2> bestWorkgroupSizeXY,
std::array<int64_t, 3> bestThreadTileSizeMNK) {
+ auto lhsType = op.inputs()[0].getType().cast<ShapedType>();
+ auto elementBits = lhsType.getElementType().getIntOrFloatBitWidth();
+ if (elementBits != 16 && elementBits != 32) return success();
+
ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success();
@@ -282,12 +286,13 @@
// Deduce the configuration for the K dimension. We need some power of two
// here so that we can do vector load.
- for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 1; t >>= 1) {
+ for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 2; t >>= 1) {
if (dimK % t == 0) {
workgroupTileSizes[2 + isBM] = invocationTileSizes[2 + isBM] = t;
break;
}
}
+ if (workgroupTileSizes[2 + isBM] == 0) return success();
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
TileSizesListType tileSizes;
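In effect, setMatmulOpConfig now bails out early (returning success() without attaching a lowering config) when the element type is neither 16- nor 32-bit, or when no power-of-two tile size of at least 2 evenly divides the K dimension; such matmuls fall back to the default SPIRVDistribute pipeline, which is what the new config_default_matmul.mlir tests below check. A condensed, standalone sketch of the K-tile deduction, with illustrative helper names rather than IREE's actual API:

#include <cstdint>

// Largest power of two not larger than n (n >= 1); stands in for
// llvm::PowerOf2Floor in this sketch.
static int64_t powerOf2Floor(int64_t n) {
  int64_t p = 1;
  while (p * 2 <= n) p *= 2;
  return p;
}

// Try power-of-two K tiles from the residual tiling factor down to 2; a
// return value of 0 means no suitable tile exists (e.g. K == 3 is odd),
// and the caller then skips the vectorized configuration.
static int64_t pickKTileSize(int64_t dimK, int64_t residualTilingFactor) {
  for (int64_t t = powerOf2Floor(residualTilingFactor); t >= 2; t >>= 1)
    if (dimK % t == 0) return t;
  return 0;
}

// pickKTileSize(3, 8) == 0 (odd K, no vectorization); pickKTileSize(32, 8) == 8.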
diff --git a/iree/compiler/Codegen/SPIRV/test/BUILD b/iree/compiler/Codegen/SPIRV/test/BUILD
index 5f847d4..75967bc 100644
--- a/iree/compiler/Codegen/SPIRV/test/BUILD
+++ b/iree/compiler/Codegen/SPIRV/test/BUILD
@@ -21,6 +21,7 @@
[
"config_adreno_conv.mlir",
"config_adreno_matmul.mlir",
+ "config_default_matmul.mlir",
"config_linalg_ext_ops.mlir",
"config_linalg_ops.mlir",
"config_mali_conv.mlir",
diff --git a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
index 0d805b1..b42fcef 100644
--- a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
@@ -16,6 +16,7 @@
SRCS
"config_adreno_conv.mlir"
"config_adreno_matmul.mlir"
+ "config_default_matmul.mlir"
"config_linalg_ext_ops.mlir"
"config_linalg_ops.mlir"
"config_mali_conv.mlir"
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
new file mode 100644
index 0000000..6cbdd91
--- /dev/null
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
@@ -0,0 +1,162 @@
+// RUN: iree-opt -split-input-file -mlir-print-local-scope -pass-pipeline='hal.executable(hal.executable.variant(iree-spirv-lower-executable-target-pass{test-lowering-configuration=true}))' %s | IreeFileCheck %s
+
+// Odd K that forbids vectorization.
+
+hal.executable @batch_matmul_1x3x32 {
+ hal.interface public @io {
+ hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, {
+ max_compute_shared_memory_size = 16384 : i32,
+ max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>,
+ subgroup_size = 4 : i32}>
+ }> {
+ hal.executable.entry_point public @batch_matmul_1x3x32 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @batch_matmul_1x3x32() {
+ %c0 = constant 0 : index
+ %c32 = constant 32 : index
+ %c3 = constant 3 : index
+ %c1 = constant 1 : index
+ %cst = constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x3x3xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b0_ro_constant[%c0] : !flow.dispatch.tensor<readonly:1x3x32xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x3x32xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_size_z = hal.interface.workgroup.size[2] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
+ scf.for %arg0 = %3 to %c1 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg1 = %5 to %c3 step %6 {
+ %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg2 = %7 to %c32 step %8 {
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_z]
+ %10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_y]
+ %11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%9, %10, 3], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:1x3x3xf32> -> tensor<?x?x3xf32>
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_z]
+ %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
+ %14 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%12, 3, %13], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:1x3x32xf32> -> tensor<?x3x?xf32>
+ %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_z]
+ %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_y]
+ %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
+ %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 1, s0)>(%arg0)[%workgroup_size_z]
+ %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 3, s0)>(%arg1)[%workgroup_size_y]
+ %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
+ %21 = linalg.init_tensor [%18, %19, %20] : tensor<?x?x?xf32>
+ %22 = linalg.fill(%cst, %21) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+ %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %14 : tensor<?x?x3xf32>, tensor<?x3x?xf32>) outs(%22 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ flow.dispatch.tensor.store %23, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%15, %16, %17], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x3x32xf32>
+ }
+ }
+ }
+ return
+ }
+ hal.interface private @io {
+ hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @batch_matmul_1x3x32
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVDistribute", workloadPerWorkgroup = [4, 1, 1]}
+// CHECK-SAME: workgroup_size = [4 : index, 1 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%[[X]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y]], %[[Z]]
+
+// CHECK: func @batch_matmul_1x3x32()
+// CHECK: linalg.batch_matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[1, 1, 4], [], [1, 1, 1]]}
+
+// -----
+
+// Non-16 / non-32 bit types cannot be vectorized right now.
+
+hal.executable private @matmul_64x16 {
+ hal.interface public @io {
+ hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, {
+ max_compute_shared_memory_size = 16384 : i32,
+ max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>,
+ subgroup_size = 4 : i32}>
+ }> {
+ hal.executable.entry_point public @matmul_64x16 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @matmul_64x16() {
+ %c0 = constant 0 : index
+ %c16 = constant 16 : index
+ %c64 = constant 64 : index
+ %c0_i32 = constant 0 : i32
+ %0 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:64x32xi8>
+ %1 = hal.interface.binding.subspan @io::@s0b0_ro_constant[%c0] : !flow.dispatch.tensor<readonly:32x16xi8>
+ %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:64x16xi32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c64 step %4 {
+ %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c16 step %6 {
+ %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x32xi8> -> tensor<?x32xi8>
+ %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 16)>(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [32, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xi8> -> tensor<32x?xi8>
+ %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg0)[%workgroup_size_y]
+ %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 16)>(%arg1)[%workgroup_size_x]
+ %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 64, s0)>(%arg0)[%workgroup_size_y]
+ %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 16, s0)>(%arg1)[%workgroup_size_x]
+ %15 = linalg.init_tensor [%13, %14] : tensor<?x?xi32>
+ %16 = linalg.fill(%c0_i32, %15) : i32, tensor<?x?xi32> -> tensor<?x?xi32>
+ %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x32xi8>, tensor<32x?xi8>) outs(%16 : tensor<?x?xi32>) -> tensor<?x?xi32>
+ flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:64x16xi32>
+ }
+ }
+ return
+ }
+ hal.interface private @io {
+ hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: hal.executable.entry_point public @matmul_64x16
+// CHECK-SAME: translation.info = {passPipeline = "SPIRVDistribute", workloadPerWorkgroup = [4, 1]}
+// CHECK-SAME: workgroup_size = [4 : index, 1 : index, 1 : index]
+// CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+// CHECK-NEXT: %[[ONE:.+]] = constant 1 : index
+// CHECK-NEXT: %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%[[X]]]
+// CHECK-NEXT: hal.return %[[X_COUNT]], %[[Y]], %[[ONE]]
+
+// CHECK: func @matmul_64x16()
+// CHECK: linalg.matmul
+// CHECK-SAME{LITERAL}: lowering.config = {tileSizes = [[1, 4], [], [1, 1]]}
diff --git a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
index 657866f..30c7932 100644
--- a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
@@ -717,6 +717,9 @@
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
if (!linalgOp || !linalgOp.hasTensorSemantics()) return failure();
if (!hasRootOpAttribute(op)) return failure();
+ if (op->getParentOfType<IREE::Flow::DispatchWorkgroupsOp>()) {
+ return failure();
+ }
// TODO(ravishankarm): It is getting strange to track when to apply this
// pattern and when not to. Need to revisit this, with dynamic shape cases
@@ -798,6 +801,9 @@
PatternRewriter &rewriter) const override {
if (!hasRootOpAttribute(tilableOp)) return failure();
if (hasOnlyDimUses(tilableOp)) return failure();
+ if (tilableOp->getParentOfType<IREE::Flow::DispatchWorkgroupsOp>()) {
+ return failure();
+ }
SmallVector<StringRef> iteratorTypes = tilableOp.getLoopIteratorTypes();
SmallVector<Range> loopRanges = tilableOp.getLoopBounds(rewriter);
@@ -1079,7 +1085,7 @@
// order here.
for (Operation &op : llvm::reverse(block)) {
// Start with a root operation and fuse its producers.
- if (!isRootOp(&op)) continue;
+ if (hasFusionGroupsAttribute(&op) || !isRootOp(&op)) continue;
unsigned newGroup = numRootOps++;
setRootAttribute(context, &op, newGroup);
@@ -1127,6 +1133,11 @@
consumerIndexingMap.getResults()) {
continue;
}
+ if (llvm::any_of(
+ consumer.getOutputOperands(), [&consumer](OpOperand *operand) {
+ return !consumer.getTiedIndexingMap(operand).isIdentity();
+ }))
+ continue;
int64_t rootNumber = getRootNumber(op);
setRootAttribute(context, user, rootNumber);
removeRootOpAttribute(op);
diff --git a/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp b/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
index 27b1607..02b3b25 100644
--- a/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
@@ -79,12 +79,35 @@
consumer.getOwner()->operand_end());
if (operands.size() >= kIreeMaxOperandCount) return false;
- llvm::SmallDenseSet<Operation *, 4> numUsers;
- for (Operation *user : producer.getUsers()) {
- if (isa<linalg::GenericOp>(user)) continue;
- numUsers.insert(user);
+ bool isBroadcast = false;
+ if (auto genericOp =
+ dyn_cast<linalg::GenericOp>(producer.getOwner())) {
+ bool parallelOp =
+ llvm::all_of(genericOp.iterator_types(), [](Attribute attr) {
+ return attr.cast<StringAttr>().getValue() ==
+ getParallelIteratorTypeName();
+ });
+ if (parallelOp) {
+ for (OpOperand *opOperand : genericOp.getInputOperands()) {
+ AffineMap indexingMap = genericOp.getTiedIndexingMap(opOperand);
+ if (indexingMap.isProjectedPermutation() &&
+ indexingMap.getNumDims() != indexingMap.getNumResults()) {
+ isBroadcast = true;
+ break;
+ }
+ }
+ }
}
- return numUsers.empty();
+ // Only fuse if it has a single linalg generic user. It is a
+ // simplistic heuristic to avoid duplicating ops that may be
+ // expensive.
+ // TODO: Add a cost model to allow ops to be duplicated.
+ if (!isBroadcast && !isa<ConstantOp>(producer.getOwner()) &&
+ !llvm::hasSingleElement(producer.getUsers()))
+ return false;
+ return llvm::all_of(producer.getUsers(), [](Operation *user) {
+ return isa<linalg::GenericOp>(user);
+ });
};
  // Simple heuristic to decide if reshape should be folded in the linalg.
// If the source of the reshape is a linalg op fold to potentially allow the
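Taken together, the rewritten check only fuses a producer when every one of its users is a linalg.generic and, unless the producer is a constant or an all-parallel generic that broadcasts one of its inputs (a projected-permutation indexing map with fewer results than loop dimensions), the producer has exactly one user. A minimal sketch of that gate as a standalone predicate, assuming the MLIR headers of this era; the names are illustrative, and the real check is the lambda shown in the hunk above:

#include "llvm/ADT/STLExtras.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"

static bool shouldFuseProducer(mlir::Operation *producerOp,
                               bool producerIsBroadcast) {
  // Duplicating a producer into many consumers can be expensive, so unless it
  // is a broadcast-like generic or a constant, require a single user.
  if (!producerIsBroadcast && !llvm::isa<mlir::ConstantOp>(producerOp) &&
      !llvm::hasSingleElement(producerOp->getUsers()))
    return false;
  // Only fuse when every user is a linalg.generic that can absorb the producer.
  return llvm::all_of(producerOp->getUsers(), [](mlir::Operation *user) {
    return llvm::isa<mlir::linalg::GenericOp>(user);
  });
}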
diff --git a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index d005ef7..db00679 100644
--- a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -117,13 +117,13 @@
mlir::createLinalgFoldUnitExtentDimsPass());
passManager.addNestedPass<mlir::FuncOp>(createInterchangeGenericOpsPass());
passManager.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
+ passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
passManager.addNestedPass<mlir::FuncOp>(createFusionOfTensorOpsPass());
passManager.addNestedPass<mlir::FuncOp>(mlir::createCSEPass());
if (clEnableLinalgDetensorize) {
passManager.addNestedPass<mlir::FuncOp>(
mlir::createLinalgDetensorizePass());
}
- passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
passManager.addNestedPass<mlir::FuncOp>(
createConvertToFlowBeforeDispatchFormation());
passManager.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
index 0dc7904..2d35b8a 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
@@ -1073,3 +1073,26 @@
// CHECK: scf.for %[[X:.+]] =
// CHECK: %[[POOL:.+]] = linalg.pooling_nhwc_sum
// CHECK: flow.dispatch.tensor.store %[[POOL]], %[[OUTPUT]], offsets = [0, %[[Z]], %[[Y]], %[[X]]], sizes = [1, %{{.+}}, %{{.+}}, %{{.+}}]
+
+// -----
+
+func @named_op_outs_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+ %c0 = constant 0 : index
+ %c1 = constant 1 : index
+ %cst1 = constant -1.0 : f64
+ %cstm1 = constant 1.0 : f64
+ %c12345 = constant 12345 : i32
+ %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+ %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+ %init = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32>
+ %fill = linalg.fill_rng_2d ins(%cst1, %cstm1, %c12345 : f64, f64, i32)
+ outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %matmul = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
+ return %matmul : tensor<?x?xf32>
+}
+// CHECK-LABEL: func @named_op_outs_fusion
+// CHECK: flow.dispatch.workgroups
+// CHECK: %[[FILL:.+]] = linalg.fill_rng_2d
+// CHECK: linalg.matmul
+// CHECK-SAME: outs(%[[FILL]] : tensor<?x?xf32>)
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index 8d63e0a..d6834d0 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -34,6 +34,12 @@
static llvm::cl::opt<bool> dumpPtx("iree-cuda-dump-ptx", llvm::cl::init(false),
llvm::cl::desc("Dump ptx"));
+static llvm::cl::opt<bool> clDisableLoopNounrollWa(
+ "iree-hal-cuda-disable-loop-nounroll-wa",
+ llvm::cl::desc(
+        "Disable the workaround for a ptxas bug in CUDA versions before 11.4"),
+ llvm::cl::init(false));
+
namespace mlir {
namespace iree_compiler {
namespace IREE {
@@ -51,7 +57,9 @@
// the no unroll metadata. This bug is fixed in cuda 11.4 but since we still
// run on older driver we need to keep it.
// TODO(thomasraoux): Remove it once we stop supporting older drivers.
- codegenPasses.add(llvm::createSetNoUnrollPass());
+ if (!clDisableLoopNounrollWa) {
+ codegenPasses.add(llvm::createSetNoUnrollPass());
+ }
targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
llvm::CGFT_AssemblyFile);
codegenPasses.run(module);
diff --git a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp
index a06b026..4ae4556 100644
--- a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp
+++ b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp
@@ -759,6 +759,69 @@
return tiledFftOp;
}
+//===----------------------------------------------------------------------===//
+// ReverseOp
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verifyReverseOp(ReverseOp op) {
+ if (op.getNumInputs()) {
+ return op.emitOpError("expected no inputs");
+ }
+ if (op.getNumOutputs() != 1) {
+ return op.emitOpError("expected exactly one output");
+ }
+
+ int64_t rank = op.getOperandRank();
+ int dimension = op.dimension();
+ if (dimension < 0 || dimension >= rank) {
+    return op.emitOpError("dimension must be within [0, ") << rank << ")";
+ }
+
+ return success();
+}
+
+bool ReverseOp::payloadUsesValueFromOperand(OpOperand *) { return false; }
+
+SmallVector<StringRef> ReverseOp::getLoopIteratorTypes() {
+ SmallVector<StringRef> iteratorTypes(getOperandRank(),
+ getParallelIteratorTypeName());
+ return iteratorTypes;
+}
+
+SmallVector<Range> ReverseOp::getLoopBounds(OpBuilder &builder) {
+ Location loc = getLoc();
+ Value zero = builder.create<ConstantIndexOp>(loc, 0);
+ Value one = builder.create<ConstantIndexOp>(loc, 1);
+ SmallVector<Range> ranges;
+ for (auto dim : llvm::seq<int64_t>(0, getOperandRank())) {
+ Value ub = getDimValue(builder, loc, operand(), dim);
+ ranges.emplace_back(Range{zero, ub, one});
+ }
+ auto dim = dimension();
+ ranges[dim].size = builder.create<SignedDivIOp>(
+ loc, ranges[dim].size, builder.create<ConstantIndexOp>(loc, 2));
+ return ranges;
+}
+
+LogicalResult ReverseOp::generateScalarImplementation(OpBuilder &b,
+ Location loc,
+ ValueRange ivs) {
+ SmallVector<Value> mirrorIndices(ivs.begin(), ivs.end());
+ auto dim = dimension();
+ auto size = getDimValue(b, loc, operand(), dim);
+ size = b.create<SubIOp>(loc, size, b.create<ConstantIndexOp>(loc, 1));
+ mirrorIndices[dim] = b.create<SubIOp>(loc, size, mirrorIndices[dim]);
+
+ // for (int i = 0; i < n / 2; ++i) {
+ // swap(array[i], array[n - 1 - i]);
+ // }
+ Value v1 = b.create<memref::LoadOp>(loc, operand(), ivs);
+ Value v2 = b.create<memref::LoadOp>(loc, operand(), mirrorIndices);
+ b.create<memref::StoreOp>(loc, v1, operand(), mirrorIndices);
+ b.create<memref::StoreOp>(loc, v2, operand(), ivs);
+ return success();
+}
+
#define DEFINE_OP_GET_EFFECTS(OP_NAME) \
void OP_NAME::getEffects( \
SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>> \
@@ -772,6 +835,7 @@
DEFINE_OP_GET_EFFECTS(ScatterOp)
DEFINE_OP_GET_EFFECTS(SortOp)
DEFINE_OP_GET_EFFECTS(FftOp)
+DEFINE_OP_GET_EFFECTS(ReverseOp)
} // namespace linalg_ext
} // namespace iree_compiler
diff --git a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td
index fe847fb..9841da1 100644
--- a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td
+++ b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td
@@ -245,6 +245,49 @@
}];
}
+def LinalgExt_ReverseOp : LinalgExt_Op<"reverse", [
+ DeclareOpInterfaceMethods<TiledOpInterface, ["generateScalarImplementation"]>,
+ DeclareOpInterfaceMethods<LinalgExtInterface,
+ // ReverseOp does not have a region, so we have to
+ // overwrite the method.
+ ["payloadUsesValueFromOperand"]>]> {
+ let summary = "Reverse operator";
+ let description = [{
+ A temporary solution of a reverse op. The loop bound of the reverse
+ dimension is half of the shape because we can simply swap elements. E.g.,
+
+ for (int i = 0; i < n / 2; ++i) {
+ std::swap(a[i], a[n - 1 - i]);
+ }
+ }];
+
+ let arguments = (ins Variadic<AnyShaped>:$inputs,
+ Variadic<AnyShaped>:$outputs,
+ I64Attr:$dimension
+ );
+ let results = (outs Variadic<AnyRankedTensor>:$results);
+ let assemblyFormat = [{
+ `dimension` `(` $dimension `)`
+ attr-dict (`ins` `(` $inputs^ `:` type($inputs) `)`)?
+ `outs` `(` $outputs `:` type($outputs) `)`
+ (`:` type($results)^)?
+ }];
+ let extraClassDeclaration = extraLinalgExtOpClassDeclaration # [{
+ Value operand() {
+ return getOutputOperand(0)->get();
+}
+ ShapedType getOperandType() {
+ return operand().getType().cast<ShapedType>();
+ }
+ int64_t getOperandRank() {
+ return getOperandType().getRank();
+ }
+ ArrayRef<int64_t> getOperandShape() {
+ return getOperandType().getShape();
+ }
+ }];
+}
+
//===----------------------------------------------------------------------===//
// Pure ops
//===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir b/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir
index b2deb0a..e488bc1 100644
--- a/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir
+++ b/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir
@@ -387,3 +387,42 @@
// CHECK-SAME: outs(%[[REAL]], %[[IMAG]] : tensor<1024xf32>, tensor<1024xf32>)
// CHECK-SAME: : tensor<1024xf32>, tensor<1024xf32>
// CHECK: return %[[RES]]#0, %[[RES]]#1
+
+// -----
+
+func @reverse_tensor(%arg0: tensor<3x5xi32>) -> tensor<3x5xi32> {
+ %0 = linalg_ext.reverse
+ dimension(0)
+ outs(%arg0 : tensor<3x5xi32>) : tensor<3x5xi32>
+ return %0 : tensor<3x5xi32>
+}
+// CHECK-LABEL: func @reverse_tensor
+// CHECK-SAME: %[[ARG0:.+]]: tensor<3x5xi32>
+// CHECK: %[[RESULT:.+]] = linalg_ext.reverse dimension(0)
+// CHECK-SAME: outs(%[[ARG0]]
+
+// -----
+
+func @reverse_memref(%arg0: memref<3x5xi32>) {
+ linalg_ext.reverse
+ dimension(0)
+ outs(%arg0 : memref<3x5xi32>)
+ return
+}
+// CHECK-LABEL: func @reverse_memref
+// CHECK-SAME: %[[ARG0:.+]]: memref<3x5xi32>
+// CHECK: linalg_ext.reverse dimension(0)
+// CHECK-SAME: outs(%[[ARG0]]
+
+// -----
+
+func @reverse_dynamic_tensor(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+ %0 = linalg_ext.reverse
+ dimension(1)
+ outs(%arg0 : tensor<?x?xi32>) : tensor<?x?xi32>
+ return %0 : tensor<?x?xi32>
+}
+// CHECK-LABEL: func @reverse_dynamic_tensor
+// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xi32>
+// CHECK: %[[RESULT:.+]] = linalg_ext.reverse dimension(1)
+// CHECK-SAME: outs(%[[ARG0]]
diff --git a/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir b/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir
index 423faca..84883b9 100644
--- a/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir
+++ b/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir
@@ -481,3 +481,29 @@
// CHECK: %[[RES3:.+]] = subf %[[L_REAL]], %[[T_REAL]]
// CHECK: %[[RES4:.+]] = subf %[[L_IMAG]], %[[T_IMAG]]
// CHECK: linalg.yield %[[RES1]], %[[RES2]], %[[RES3]], %[[RES4]]
+
+// -----
+
+func @reverse_dim_0(%arg0: memref<?x?xi32>) {
+ linalg_ext.reverse
+ dimension(0)
+ outs(%arg0 : memref<?x?xi32>)
+ return
+}
+// CHECK-LABEL: func @reverse_dim_0
+// CHECK-SAME: %[[BUF:[a-zA-Z0-9]+]]
+// CHECK-DAG: %[[C0:.+]] = constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = constant 1 : index
+// CHECK-DAG: %[[C2:.+]] = constant 2 : index
+// CHECK-DAG: %[[D0:.+]] = memref.dim %arg0, %c0 : memref<?x?xi32>
+// CHECK-DAG: %[[D1:.+]] = memref.dim %arg0, %c1 : memref<?x?xi32>
+// CHECK-DAG: %[[REV_UB:.+]] = divi_signed %[[D0]], %[[C2]] : index
+// CHECK: scf.for %[[I:.+]] = %[[C0]] to %[[REV_UB]] step %[[C1]]
+// CHECK: scf.for %[[J:.+]] = %[[C0]] to %[[D1]] step %[[C1]]
+// CHECK: %[[T0:.+]] = memref.dim %[[BUF]], %[[C0]]
+// CHECK: %[[T1:.+]] = subi %[[T0]], %[[C1]] : index
+// CHECK: %[[T2:.+]] = subi %[[T1]], %[[I]] : index
+// CHECK: %[[V0:.+]] = memref.load %[[BUF]][%[[I]], %[[J]]]
+// CHECK: %[[V1:.+]] = memref.load %[[BUF]][%[[T2]], %[[J]]]
+// CHECK: memref.store %[[V0]], %[[BUF]][%[[T2]], %[[J]]] : memref<?x?xi32>
+// CHECK: memref.store %[[V1]], %[[BUF]][%[[I]], %[[J]]] : memref<?x?xi32>