Revert commits following a bad test deactivation (#12147)
There seems to be a legitimate error lurking that I do not yet understand.
diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD b/compiler/src/iree/compiler/Codegen/Common/BUILD
index d6ba6c8..0d483a0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/BUILD
+++ b/compiler/src/iree/compiler/Codegen/Common/BUILD
@@ -97,7 +97,6 @@
"//compiler/src/iree/compiler/Codegen/LLVMCPU/TransformExtensions:LLVMCPUExtensions",
"//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions",
"@llvm-project//mlir:AffineTransformOps",
- "@llvm-project//mlir:BufferizationTransformOps",
"@llvm-project//mlir:GPUTransformOps",
"@llvm-project//mlir:LinalgTransformOps",
"@llvm-project//mlir:MemRefTransformOps",
diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
index ddf494a..d722594 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -39,7 +39,6 @@
MLIRArithUtils
MLIRAsyncDialect
MLIRBufferizationDialect
- MLIRBufferizationTransformOps
MLIRBufferizationTransforms
MLIRFuncDialect
MLIRGPUOps
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
index d89e6d1..a6207ac 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
@@ -21,7 +21,6 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
#include "mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -100,7 +99,6 @@
iree_compiler::registerTransformDialectLLVMCPUExtension(registry);
iree_compiler::registerTransformDialectLLVMGPUExtension(registry);
affine::registerTransformDialectExtension(registry);
- bufferization::registerTransformDialectExtension(registry);
gpu::registerTransformDialectExtension(registry);
linalg::registerTransformDialectExtension(registry);
memref::registerTransformDialectExtension(registry);
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
index b281ed8..62b7a42 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -14,7 +14,6 @@
#include "iree/compiler/Codegen/Common/Transforms.h"
#include "iree/compiler/Codegen/Interfaces/BufferizationInterfaces.h"
#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
@@ -87,7 +86,6 @@
ADD_PATTERN(swapPaddingElideConditional,
getSwapPaddingElideConditionalAttrName)
ADD_PATTERN(swappingPatterns, getSwappingPatternsAttrName)
- ADD_PATTERN(unrollVectorsGpuMma, getUnrollVectorsGpuMmaAttrName)
#undef ADD_PATTERN
result.addTypes({pdl::OperationType::get(ctx)});
}
@@ -204,23 +202,6 @@
});
}
-static Optional<SmallVector<int64_t>> getGPUTensorCoreNativeVectorSize(
- Operation *op) {
- return getWmmaNativeVectorSize(op);
-}
-
-static void addUnrollVectorsGpuMmaPatterns(RewritePatternSet &patterns) {
- auto unrollOrder = [](Operation *op) -> Optional<SmallVector<int64_t>> {
- auto contract = dyn_cast<vector::ContractionOp>(op);
- if (!contract) return std::nullopt;
- return mlir::iree_compiler::gpuMmaUnrollOrder(contract);
- };
- vector::populateVectorUnrollPatterns(
- patterns, vector::UnrollVectorOptions()
- .setNativeShapeFn(getGPUTensorCoreNativeVectorSize)
- .setUnrollTraversalOrderFn(unrollOrder));
-}
-
static void addAdditionalIreePatterns(RewritePatternSet &patterns) {
patterns.add<GenerateToConstant>(patterns.getContext());
}
@@ -265,7 +246,6 @@
linalg::populateFoldReshapeOpsByExpansionPatterns(
patterns, [](OpOperand *) { return true; });
}
- if (getUnrollVectorsGpuMma()) addUnrollVectorsGpuMmaPatterns(patterns);
TrackingListener listener(state);
GreedyRewriteConfig config;
@@ -790,22 +770,12 @@
transform::TransformResults &transformResults,
transform::TransformState &state) {
ArrayRef<Operation *> targetOps = state.getPayloadOps(getTarget());
- if (targetOps.empty()) {
- transformResults.set(getForeachThreadOp().cast<OpResult>(), {});
- transformResults.set(getTiledOp().cast<OpResult>(), {});
- return DiagnosedSilenceableFailure::success();
- }
- if (targetOps.size() != 1) {
- return mlir::emitDefiniteFailure(
- state.getTopLevel(),
- "expected single target op in payload, got: ")
- << targetOps.size();
- }
+ assert(targetOps.size() == 1 && "expected single target op in payload");
auto funcOp = targetOps.front()->getParentOfType<func::FuncOp>();
FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
if (failed(exportOp)) {
- return mlir::emitDefiniteFailure(state.getTopLevel(),
- "couldn't find export op for func");
+ return mlir::emitDefiniteFailure(
+ state.getTopLevel(), "couldn't find top level HAL export op for func");
}
/// Lower the workgroup count region in keeping with the way dispatch
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h
index e268c4e..87cea4f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h
@@ -50,7 +50,6 @@
bool rewritePackOps = false;
bool swapPaddingElideConditional = false;
bool swappingPatterns = false;
- bool unrollVectorsGpuMma = false;
};
} // namespace transform_dialect
} // namespace IREE
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
index 52f535d..1dcd352 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
@@ -68,10 +68,8 @@
tensor.extract_slice swapping pattern. This injects static information
that guarantees padding is smaller than the window size which guarantees
we never see a tile comprised of padding-only.
- - unroll_vectors_gpu_mma: adds patterns that unroll vectors to a native tile
- size for GPUs with mma operations. The size is currently hardcoded but
- should be refactored upstream and made pluggable.
-
+    This allows dropping the generation of an annoying internal scf.if but may
+    yield incorrect code in pathological cases.
#### Return modes:
@@ -100,8 +98,7 @@
UnitAttr:$rank_reducing_vector,
UnitAttr:$rewrite_pack_ops,
UnitAttr:$swap_padding_elide_conditional,
- UnitAttr:$swapping_patterns,
- UnitAttr:$unroll_vectors_gpu_mma);
+ UnitAttr:$swapping_patterns);
let results = (outs PDL_Operation:$result);
let assemblyFormat = "$target attr-dict";
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD
index 40c37ba..a6b366f 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD
@@ -55,7 +55,6 @@
"//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions",
"//compiler/src/iree/compiler/Dialect/Flow/TransformExtensions:FlowExtensions",
"@llvm-project//mlir:AffineTransformOps",
- "@llvm-project//mlir:BufferizationTransformOps",
"@llvm-project//mlir:GPUTransformOps",
"@llvm-project//mlir:LinalgTransformOps",
"@llvm-project//mlir:MemRefTransformOps",
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
index 8179315..05701b4 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
@@ -24,7 +24,6 @@
IREELinalgExtTransformOps
IREELinalgTransformDialect
MLIRAffineTransformOps
- MLIRBufferizationTransformOps
MLIRGPUTransformOps
MLIRIR
MLIRLinalgTransformOps
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp
index ea299b6..dae53e8 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp
@@ -18,7 +18,6 @@
#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h"
#include "iree/compiler/Dialect/Flow/TransformExtensions/FlowExtensions.h"
#include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
-#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"
@@ -45,7 +44,6 @@
linalg::registerTilingInterfaceExternalModels(registry);
affine::registerTransformDialectExtension(registry);
- bufferization::registerTransformDialectExtension(registry);
gpu::registerTransformDialectExtension(registry);
linalg::registerTransformDialectExtension(registry);
memref::registerTransformDialectExtension(registry);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
index 73b167e..91c1b33 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
@@ -46,6 +46,38 @@
vector::populateVectorReductionToContractPatterns(patterns);
}
+static Optional<SmallVector<int64_t>> unrollOrder(Operation *op) {
+ auto contract = dyn_cast<vector::ContractionOp>(op);
+ if (!contract) return std::nullopt;
+ SmallVector<int64_t> order;
+  // Pick an unrolling order that lets tensorcore operations reuse the LHS
+  // register. This is needed to get good performance on the sm_80 target.
+ // First make reduction the outer dimensions.
+ for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
+ if (vector::isReductionIterator(iter)) {
+ order.push_back(index);
+ }
+ }
+
+ llvm::SmallDenseSet<int64_t> dims;
+ for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
+ dims.insert(expr.cast<AffineDimExpr>().getPosition());
+ }
+  // Then parallel dimensions that are part of the LHS, as we want to reuse it.
+ for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
+ if (vector::isParallelIterator(iter) && dims.count(index)) {
+ order.push_back(index);
+ }
+ }
+ // Then the remaining parallel loops.
+ for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
+ if (vector::isParallelIterator(iter) && !dims.count(index)) {
+ order.push_back(index);
+ }
+ }
+ return order;
+}
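+
+// Worked example (hypothetical, for illustration only): for a matmul-like
+// vector.contract with iterator types [parallel, parallel, reduction]
+// (d0 = M, d1 = N, d2 = K) and an LHS indexing map of (d0, d2), `dims`
+// becomes {0, 2}, so the returned order is [2, 0, 1]: the K reduction
+// first, then M (shared with the LHS), then N.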
+
/// Returns vector::ContractionOp operand's index where the result is used.
static Optional<int> getVectorContractOpOperandId(
vector::ContractionOp contractOp, OpResult result) {
@@ -198,11 +230,6 @@
}
static void populateVectorUnrollPatterns(RewritePatternSet &patterns) {
- auto unrollOrder = [](Operation *op) -> Optional<SmallVector<int64_t>> {
- auto contract = dyn_cast<vector::ContractionOp>(op);
- if (!contract) return std::nullopt;
- return gpuMmaUnrollOrder(contract);
- };
vector::populateVectorUnrollPatterns(
patterns, vector::UnrollVectorOptions()
.setNativeShapeFn(getGPUTensorCoreNativeVectorSize)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 9e5efa6..f04a064 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -104,12 +104,12 @@
auto &nestedModulePM = pm.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
+ createConvertToDestinationPassingStylePass(
+ useWARForCooperativeMatrixCodegen));
+ nestedModulePM.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createTileAndDecomposeAttentionPass());
nestedModulePM.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createDecomposeSoftmaxPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass(
- useWARForCooperativeMatrixCodegen));
nestedModulePM.addPass(createCanonicalizerPass());
nestedModulePM.addPass(createCSEPass());
}
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
index eb87340..4cb6d99 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -112,37 +112,6 @@
return threadsAvailable == 1;
}
-/// Pick an unrolling order that will allow tensorcore operation to reuse LHS
-/// register. This is needed to get good performance on sm_80 target.
-Optional<SmallVector<int64_t>> gpuMmaUnrollOrder(
- vector::ContractionOp contract) {
- SmallVector<int64_t> order;
- // First make reduction the outer dimensions.
- for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
- if (vector::isReductionIterator(iter)) {
- order.push_back(index);
- }
- }
-
- llvm::SmallDenseSet<int64_t> dims;
- for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
- dims.insert(expr.cast<AffineDimExpr>().getPosition());
- }
- // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
- for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
- if (vector::isParallelIterator(iter) && dims.count(index)) {
- order.push_back(index);
- }
- }
- // Then the remaining parallel loops.
- for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
- if (vector::isParallelIterator(iter) && !dims.count(index)) {
- order.push_back(index);
- }
- }
- return order;
-}
-
//===----------------------------------------------------------------------===//
// GPU workgroup memory
//===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
index ace232c..10e4838 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -48,11 +48,6 @@
int64_t threadCount,
int64_t vectorSize);
-/// Pick an unrolling order that will allow tensorcore operation to reuse LHS
-/// register. This is needed to get good performance on sm_80 target.
-Optional<SmallVector<int64_t>> gpuMmaUnrollOrder(
- vector::ContractionOp contract);
-
//===----------------------------------------------------------------------===//
// GPU workgroup memory
//===----------------------------------------------------------------------===//
diff --git a/tests/e2e/linalg_ext_ops/BUILD b/tests/e2e/linalg_ext_ops/BUILD
index ae8bd96..9966915 100644
--- a/tests/e2e/linalg_ext_ops/BUILD
+++ b/tests/e2e/linalg_ext_ops/BUILD
@@ -18,10 +18,7 @@
srcs = enforce_glob(
# keep sorted
[
- # attention.mlir is broken at HEAD and blocks commits.
- # https://github.com/iree-org/iree/issues/12129
- # reactivate when truly fixed.
- # "attention.mlir",
+ "attention.mlir",
"reverse.mlir",
"scan.mlir",
"scatter.mlir",
@@ -31,7 +28,6 @@
],
include = ["*.mlir"],
exclude = [
- "attention.mlir",
"pack.mlir",
"unpack.mlir",
"winograd_input.mlir",
@@ -99,10 +95,7 @@
srcs = enforce_glob(
# keep sorted
[
- # attention.mlir is broken at HEAD and blocks commits.
- # https://github.com/iree-org/iree/issues/12129
- # reactivate when truly fixed.
- # "attention.mlir",
+ "attention.mlir",
"pack.mlir",
"reverse.mlir",
"scan.mlir",
@@ -115,9 +108,6 @@
"winograd_output.mlir",
],
include = ["*.mlir"],
- exclude = [
- "attention.mlir",
- ],
),
driver = "local-task",
target_backend = "llvm-cpu",
diff --git a/tests/e2e/linalg_ext_ops/CMakeLists.txt b/tests/e2e/linalg_ext_ops/CMakeLists.txt
index d209465..46a8b04 100644
--- a/tests/e2e/linalg_ext_ops/CMakeLists.txt
+++ b/tests/e2e/linalg_ext_ops/CMakeLists.txt
@@ -14,6 +14,7 @@
NAME
check_cuda
SRCS
+ "attention.mlir"
"reverse.mlir"
"scan.mlir"
"scatter.mlir"
@@ -77,6 +78,7 @@
NAME
check_llvm-cpu_local-task
SRCS
+ "attention.mlir"
"pack.mlir"
"reverse.mlir"
"scan.mlir"