Integrate LLVM at https://github.com/llvm/llvm-project/commit/161255eb059e36f40136e26c7eb2ab92ac98ef9d (#12822)
MHLO: https://github.com/tensorflow/mlir-hlo/commit/926e62f88435b5098054f9cb0b0eecb900e71e96
TensorFlow: https://github.com/tensorflow/tensorflow/commit/0a8fd11e73559d9bf0e547343f08f648981e2ff7
Co-authored-by: Lei Zhang <antiagainst@google.com>
Co-authored-by: Mahesh Ravishankar <ravishankarm@google.com>
diff --git a/compiler/src/iree/compiler/API/api_exports.c b/compiler/src/iree/compiler/API/api_exports.c
index 7cff00b..15ada99 100644
--- a/compiler/src/iree/compiler/API/api_exports.c
+++ b/compiler/src/iree/compiler/API/api_exports.c
@@ -305,6 +305,7 @@
extern void mlirF64TypeGet();
extern void mlirFlatSymbolRefAttrGet();
extern void mlirFlatSymbolRefAttrGetValue();
+extern void mlirFloat8E4M3B11FNUZTypeGet();
extern void mlirFloat8E4M3FNTypeGet();
extern void mlirFloat8E4M3FNUZTypeGet();
extern void mlirFloat8E5M2FNUZTypeGet();
@@ -540,6 +541,7 @@
extern void mlirTypeIsAF16();
extern void mlirTypeIsAF32();
extern void mlirTypeIsAF64();
+extern void mlirTypeIsAFloat8E4M3B11FNUZ();
extern void mlirTypeIsAFloat8E4M3FN();
extern void mlirTypeIsAFloat8E4M3FNUZ();
extern void mlirTypeIsAFloat8E5M2();
@@ -893,6 +895,7 @@
x += (uintptr_t)&mlirF64TypeGet;
x += (uintptr_t)&mlirFlatSymbolRefAttrGet;
x += (uintptr_t)&mlirFlatSymbolRefAttrGetValue;
+ x += (uintptr_t)&mlirFloat8E4M3B11FNUZTypeGet;
x += (uintptr_t)&mlirFloat8E4M3FNTypeGet;
x += (uintptr_t)&mlirFloat8E4M3FNUZTypeGet;
x += (uintptr_t)&mlirFloat8E5M2FNUZTypeGet;
@@ -1128,6 +1131,7 @@
x += (uintptr_t)&mlirTypeIsAF16;
x += (uintptr_t)&mlirTypeIsAF32;
x += (uintptr_t)&mlirTypeIsAF64;
+ x += (uintptr_t)&mlirTypeIsAFloat8E4M3B11FNUZ;
x += (uintptr_t)&mlirTypeIsAFloat8E4M3FN;
x += (uintptr_t)&mlirTypeIsAFloat8E4M3FNUZ;
x += (uintptr_t)&mlirTypeIsAFloat8E5M2;
diff --git a/compiler/src/iree/compiler/API/api_exports.def b/compiler/src/iree/compiler/API/api_exports.def
index 06e7d0f..95429a1 100644
--- a/compiler/src/iree/compiler/API/api_exports.def
+++ b/compiler/src/iree/compiler/API/api_exports.def
@@ -297,6 +297,7 @@
mlirF64TypeGet
mlirFlatSymbolRefAttrGet
mlirFlatSymbolRefAttrGetValue
+ mlirFloat8E4M3B11FNUZTypeGet
mlirFloat8E4M3FNTypeGet
mlirFloat8E4M3FNUZTypeGet
mlirFloat8E5M2FNUZTypeGet
@@ -532,6 +533,7 @@
mlirTypeIsAF16
mlirTypeIsAF32
mlirTypeIsAF64
+ mlirTypeIsAFloat8E4M3B11FNUZ
mlirTypeIsAFloat8E4M3FN
mlirTypeIsAFloat8E4M3FNUZ
mlirTypeIsAFloat8E5M2
diff --git a/compiler/src/iree/compiler/API/api_exports.ld b/compiler/src/iree/compiler/API/api_exports.ld
index d96c51a..a1766ef 100644
--- a/compiler/src/iree/compiler/API/api_exports.ld
+++ b/compiler/src/iree/compiler/API/api_exports.ld
@@ -298,6 +298,7 @@
mlirF64TypeGet;
mlirFlatSymbolRefAttrGet;
mlirFlatSymbolRefAttrGetValue;
+ mlirFloat8E4M3B11FNUZTypeGet;
mlirFloat8E4M3FNTypeGet;
mlirFloat8E4M3FNUZTypeGet;
mlirFloat8E5M2FNUZTypeGet;
@@ -533,6 +534,7 @@
mlirTypeIsAF16;
mlirTypeIsAF32;
mlirTypeIsAF64;
+ mlirTypeIsAFloat8E4M3B11FNUZ;
mlirTypeIsAFloat8E4M3FN;
mlirTypeIsAFloat8E4M3FNUZ;
mlirTypeIsAFloat8E5M2;
diff --git a/compiler/src/iree/compiler/API/api_exports.macos.lst b/compiler/src/iree/compiler/API/api_exports.macos.lst
index 4f0b1f7..81d1668 100644
--- a/compiler/src/iree/compiler/API/api_exports.macos.lst
+++ b/compiler/src/iree/compiler/API/api_exports.macos.lst
@@ -296,6 +296,7 @@
_mlirF64TypeGet
_mlirFlatSymbolRefAttrGet
_mlirFlatSymbolRefAttrGetValue
+_mlirFloat8E4M3B11FNUZTypeGet
_mlirFloat8E4M3FNTypeGet
_mlirFloat8E4M3FNUZTypeGet
_mlirFloat8E5M2FNUZTypeGet
@@ -531,6 +532,7 @@
_mlirTypeIsAF16
_mlirTypeIsAF32
_mlirTypeIsAF64
+_mlirTypeIsAFloat8E4M3B11FNUZ
_mlirTypeIsAFloat8E4M3FN
_mlirTypeIsAFloat8E4M3FNUZ
_mlirTypeIsAFloat8E5M2
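
The four export-list hunks above all register the same new MLIR C-API pair for the f8E4M3B11FNUZ type. As a minimal sketch of what those symbols expose to API consumers (assuming the usual TypeGet/TypeIsA declarations in mlir-c/BuiltinTypes.h; the helper name is illustrative only):

    #include "mlir-c/BuiltinTypes.h"
    #include "mlir-c/IR.h"

    // Construct the new f8E4M3B11FNUZ type and check its kind via the C API.
    static bool roundTripF8E4M3B11FNUZ(MlirContext ctx) {
      MlirType f8 = mlirFloat8E4M3B11FNUZTypeGet(ctx);
      return mlirTypeIsAFloat8E4M3B11FNUZ(f8);
    }
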
diff --git a/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp b/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp
index 3f44d4f..eb040ac 100644
--- a/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp
@@ -143,8 +143,8 @@
unsigned index = idOp.getDimension().getZExtValue();
if (index >= numWorkgroup.size()) return failure();
constraints.appendDimVar({idOp});
- constraints.addBound(FlatAffineRelation::BoundType::LB, idOp, 0);
- constraints.addBound(FlatAffineRelation::BoundType::UB, idOp,
+ constraints.addBound(presburger::BoundType::LB, idOp, 0);
+ constraints.addBound(presburger::BoundType::UB, idOp,
numWorkgroup[index] - 1);
}
return canonicalizeMinMaxOp(rewriter, minOp, constraints);
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp
index fda438f..05f59da 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
diff --git a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
index 1897a6a..b73a182 100644
--- a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
@@ -112,7 +112,8 @@
// TODO(thomasraoux): Remove it once the fix is merged.
loopInvariantCodeMotion(funcOp);
linalg::hoistRedundantVectorTransfers(funcOp);
- vector::transferOpflowOpt(funcOp);
+ IRRewriter rewriter(funcOp->getContext());
+ vector::transferOpflowOpt(rewriter, funcOp);
// Move bitcast inwards from loop region boundaries to increase chances to
// cancel them.
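
The hunk above tracks the updated MLIR signature in which vector::transferOpflowOpt takes an explicit rewriter. A minimal sketch of the new calling convention (assuming the declaration lives in mlir/Dialect/Vector/Transforms/VectorTransforms.h; the wrapper name is illustrative only):

    #include "mlir/Dialect/Func/IR/FuncOps.h"
    #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
    #include "mlir/IR/PatternMatch.h"

    // Run store-to-load forwarding and dead transfer elimination on a
    // function, routing every IR mutation through an explicit rewriter.
    static void optimizeTransfers(mlir::func::FuncOp funcOp) {
      mlir::IRRewriter rewriter(funcOp->getContext());
      mlir::vector::transferOpflowOpt(rewriter, funcOp);
    }
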
diff --git a/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp b/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp
index bc3b741..177d268 100644
--- a/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp
@@ -8,6 +8,7 @@
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Pass/Pass.h"
@@ -36,7 +37,7 @@
.Case("vector-transfers",
vector::VectorTransferSplit::VectorTransfer)
.Default(vector::VectorTransferSplit::None));
- patterns.add<vector::VectorTransferFullPartialRewriter>(ctx, options);
+ populateVectorTransferFullPartialPatterns(patterns, options);
if (failed(applyPatternsAndFoldGreedily(getOperation(),
std::move(patterns)))) {
return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
index e0c347c..d38ce0f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -39,6 +39,7 @@
#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/Passes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/Pass/PassManager.h"
@@ -85,14 +86,15 @@
// Track temporary allocations that are never read from. If this is the case
// it means both the allocations and associated stores can be removed.
-static void eraseDeadAllocAndStores(Operation *parentOp) {
+static void eraseDeadAllocAndStores(RewriterBase &rewriter,
+ Operation *parentOp) {
std::vector<Operation *> opToErase;
parentOp->walk([&](memref::AllocOp op) {
if (allUsesAreStores(op, opToErase)) {
opToErase.push_back(op.getOperation());
}
});
- for (Operation *op : opToErase) op->erase();
+ for (Operation *op : opToErase) rewriter.eraseOp(op);
}
//===---------------------------------------------------------------------===//
@@ -103,9 +105,12 @@
Operation *target, transform::ApplyToEachResultList &results,
transform::TransformState &state) {
// Apply store to load forwarding and dead store elimination.
- vector::transferOpflowOpt(target);
- eraseDeadAllocAndStores(target);
- return DiagnosedSilenceableFailure::success();
+ IRRewriter rewriter(target->getContext());
+ TrackingListener listener(state);
+ rewriter.setListener(&listener);
+ vector::transferOpflowOpt(rewriter, target);
+ eraseDeadAllocAndStores(rewriter, target);
+ return listener.check(target->getLoc());
}
void transform_dialect::ApplyBufferOptimizationsOp::getEffects(
diff --git a/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp
index 32ba18d..fe6e242 100644
--- a/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp
@@ -14,6 +14,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
diff --git a/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp b/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp
index 4c7c9d7..624fcc3 100644
--- a/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp
@@ -15,6 +15,8 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir b/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
index bb21463..2cc5291 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
@@ -7,13 +7,13 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 336)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 4)>
//CHECK-LABEL: func.func @load_subspan_with_offset
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]]
// CHECK: return %[[LOAD]]
@@ -26,13 +26,13 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 24)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
//CHECK-LABEL: func.func @store_subspan_with_offset
// CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: memref.store %[[VALUE]], %[[SUBSPAN]][%[[INDEX]]] : memref<?xf32>
// -----
@@ -43,7 +43,7 @@
return %val: vector<4xf32>
}
-// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 16)>
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 16)>
//CHECK-LABEL: func.func @load_subspan_with_vector_element
// CHECK: affine.apply #[[$MAP]]()
@@ -55,7 +55,7 @@
return %val: f16
}
-// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 2)>
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 2)>
//CHECK-LABEL: func.func @load_subspan_with_16bit_element
// CHECK: affine.apply #[[$MAP]]()
@@ -69,14 +69,14 @@
}
// CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 12 + s1 floordiv 4)
-// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
//CHECK-LABEL: func.func @store_subspan_with_leading_dynamic_dim
// CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK: %[[DIM:.+]] = hal.interface.constant.load[0] : index
// CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM]], %[[OFFSET]]]
// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] : memref<?xf32>
// -----
@@ -93,7 +93,7 @@
}
// CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (((s0 * s1) * s2) * s3 + s4 floordiv 4)>
-// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7] -> (s1 + s2 floordiv 4 + (s4 + (s7 + s5 * s6) * s3) * s0)>
+// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7] -> (s0 floordiv 4 + s2 + (s4 + (s7 + s5 * s6) * s3) * s1)>
//CHECK-LABEL: func.func @store_subspan_with_all_dynamic_dim
// CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
@@ -103,7 +103,7 @@
// CHECK: %[[DIM3:.+]] = hal.interface.constant.load[3] : index
// CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM0]], %[[DIM1]], %[[DIM2]], %[[DIM3]], %[[OFFSET]]]
// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[DIM3]], %[[I3]], %[[OFFSET]], %[[DIM2]], %[[I2]], %[[I0]], %[[DIM1]], %[[I1]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[DIM3]], %[[I3]], %[[DIM2]], %[[I2]], %[[I0]], %[[DIM1]], %[[I1]]]
// CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]]
// -----
@@ -117,7 +117,7 @@
}
// CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1, s2] -> ((s0 * s1) * 32 + s2 floordiv 4)>
-// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0 + s1 floordiv 4 + s3 * 8 + ((s4 * 4 + s5) * s2) * 8)>
+// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0 floordiv 4 + s1 + s3 * 8 + ((s4 * 4 + s5) * s2) * 8)>
//CHECK-LABEL: func.func @store_subspan_with_mixed_dynamic_dim
// CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
@@ -125,7 +125,7 @@
// CHECK: %[[DIM2:.+]] = hal.interface.constant.load[1] : index
// CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM0]], %[[DIM2]], %[[OFFSET]]]
// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[I3]], %[[OFFSET]], %[[DIM2]], %[[I2]], %[[I0]], %[[I1]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I3]], %[[DIM2]], %[[I2]], %[[I0]], %[[I1]]]
// CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]]
// -----
@@ -140,7 +140,7 @@
}
// CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 12 + s1 floordiv 4)
-// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+// CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
//CHECK-LABEL: func.func @store_subspan_with_flow_control
// CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
@@ -148,7 +148,7 @@
// CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM]], %[[OFFSET]]]
// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
// CHECK: scf.for
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] : memref<?xf32>
// -----
@@ -221,7 +221,7 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 336)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 4)>
//CHECK-LABEL: func.func @transfer_read_subspan_with_offset
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
@@ -230,7 +230,7 @@
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]]
// CHECK: %[[MEMREF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG0]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]]
// CHECK: %[[VEC:.+]] = vector.transfer_read %[[MEMREF]][%[[INDEX]]]
// CHECK: return %[[VEC]]
@@ -244,7 +244,7 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 336)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 4)>
//CHECK-LABEL: func.func @transfer_write_subspan_with_offset
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
@@ -254,7 +254,7 @@
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]]
// CHECK: %[[MEMREF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG0]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]]
// CHECK: vector.transfer_write %[[ARG4]], %[[MEMREF]][%[[INDEX]]]
// -----
@@ -333,13 +333,13 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 840)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 42 + s1 + s2 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 42 + s2 + s0 floordiv 4)>
//CHECK-LABEL: func.func @collapse_shape
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]]
// -----
@@ -352,13 +352,13 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 840)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 * 210 + s1 * 42 + s2 * 7 + s3 + s4 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s1 * 210 + s2 * 42 + s3 * 7 + s4 + s0 floordiv 4)>
//CHECK-LABEL: func.func @expand_shape
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index)
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[I3]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]], %[[I3]]]
// CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]]
// -----
@@ -371,13 +371,13 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 128)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 128 + s1 + s2 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 128 + s2 + s0 floordiv 4)>
//CHECK-LABEL: func.func @expand_shape2
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]]
// -----
@@ -413,13 +413,13 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 4096)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 128 + s1 + s2 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 128 + s2 + s0 floordiv 4)>
//CHECK-LABEL: func.func @subview
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]]
// -----
@@ -462,13 +462,13 @@
}
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 floordiv 2 + 1024)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 32 + s1 + s2 floordiv 2)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 32 + s2 + s0 floordiv 2)>
// CHECK-LABEL: func.func @subgroup_mma_load_with_offset
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf16, 3>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: %[[LD:.+]] = gpu.subgroup_mma_load_matrix %[[SUBSPAN]][%[[INDEX]]] {leadDimension = 32 : index}
// CHECK: return %[[LD]]
@@ -481,13 +481,13 @@
}
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 floordiv 2 + 1024)>
-// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 32 + s1 + s2 floordiv 2)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 32 + s2 + s0 floordiv 2)>
// CHECK-LABEL: func.func @subgroup_mma_store_with_offset
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[VAL:.+]]: !gpu.mma_matrix<16x16xf16, "COp">
// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf16, 3>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: gpu.subgroup_mma_store_matrix %[[VAL]], %[[SUBSPAN]][%[[INDEX]]] {leadDimension = 128 : index}
// -----
@@ -498,12 +498,12 @@
return %val: i32
}
-// CHECK: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+// CHECK: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
// CHECK-LABEL: func.func @load_uniform_buffer
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%[[C0]]) : memref<?xi32, #hal.descriptor_type<uniform_buffer>>
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: %[[LD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]] : memref<?xi32, #hal.descriptor_type<uniform_buffer>>
// CHECK: return %[[LD]] : i32
@@ -517,11 +517,11 @@
}
// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 24)>
-// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
// CHECK-LABEL: func.func @store_uniform_buffer
// CHECK-SAME: (%[[VAL:.+]]: i32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%[[C0]]) : memref<?xi32, #hal.descriptor_type<uniform_buffer>>{%[[SIZE]]}
-// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: memref.store %[[VAL]], %[[SUBSPAN]][%[[INDEX]]] : memref<?xi32, #hal.descriptor_type<uniform_buffer>>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
index fbd1f2d..fd029f3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -48,6 +48,8 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
@@ -821,12 +823,18 @@
RewritePatternSet patterns(&getContext());
vector::populateVectorToVectorCanonicalizationPatterns(patterns);
vector::populateVectorBroadcastLoweringPatterns(patterns);
- vector::populateVectorContractLoweringPatterns(patterns);
+  // TODO: doubtful that the "default" does what one wants here; it is likely
+  // better to use outerproduct.
+ vector::populateVectorContractLoweringPatterns(
+ patterns, vector::VectorTransformsOptions());
vector::populateVectorMaskMaterializationPatterns(
patterns, /*force32BitVectorIndices=*/false);
vector::populateVectorMaskOpLoweringPatterns(patterns);
vector::populateVectorShapeCastLoweringPatterns(patterns);
- vector::populateVectorTransposeLoweringPatterns(patterns);
+  // TODO: doubtful that the "default" does what one wants here; it is likely
+  // better to use shuffle.
+ vector::populateVectorTransposeLoweringPatterns(
+ patterns, vector::VectorTransformsOptions());
populateConvertArmNeon2dToIntrPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(getOperation(),
std::move(patterns)))) {
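
On the two TODOs above: these populate entry points now take an explicit vector::VectorTransformsOptions, and a default-constructed options struct may not pick the lowering the backend actually wants. A hedged sketch of spelling the choice out, reusing the outer-product contraction strategy that other hunks in this patch already select (the helper name is illustrative only):

    #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
    #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
    #include "mlir/IR/PatternMatch.h"

    static void populateExplicitVectorLowering(mlir::RewritePatternSet &patterns) {
      // Pick the contraction lowering explicitly instead of relying on the
      // default-constructed options.
      auto options = mlir::vector::VectorTransformsOptions().setVectorTransformsOptions(
          mlir::vector::VectorContractLowering::OuterProduct);
      mlir::vector::populateVectorContractLoweringPatterns(patterns, options);
      // The transpose lowering strategy could be pinned the same way on
      // `options` before populating the transpose patterns.
      mlir::vector::populateVectorTransposeLoweringPatterns(patterns, options);
    }
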
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp
index 3cfbef6..ac2d324 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp
@@ -10,6 +10,8 @@
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -134,10 +136,9 @@
vector::VectorTransformsOptions().setVectorTransformsOptions(
vector::VectorContractLowering::OuterProduct);
RewritePatternSet vectorContractLoweringPatterns(&getContext());
- vectorContractLoweringPatterns.insert<
- vector::ContractionOpToOuterProductOpLowering,
- vector::ContractionOpToMatmulOpLowering, vector::ContractionOpLowering>(
- vectorTransformsOptions, context);
+ vector::populateVectorContractLoweringPatterns(
+ vectorContractLoweringPatterns, vectorTransformsOptions, /*benefit=*/1,
+ /*disableOuterProductLowering=*/true);
vector::populateVectorTransferPermutationMapLoweringPatterns(
vectorContractLoweringPatterns);
if (failed(applyPatternsAndFoldGreedily(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
index e4921de..8ae73b2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
@@ -27,6 +27,7 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
@@ -112,7 +113,10 @@
vector::VectorContractLowering::OuterProduct));
vector::populateVectorMaskOpLoweringPatterns(patterns);
vector::populateVectorShapeCastLoweringPatterns(patterns);
- vector::populateVectorTransposeLoweringPatterns(patterns);
+  // TODO: doubtful that the "default" does what one wants here; it is likely
+  // better to use something else.
+ vector::populateVectorTransposeLoweringPatterns(
+ patterns, vector::VectorTransformsOptions());
vector::populateVectorTransferLoweringPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns)))) {
return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index 5ffe476..abec204 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -25,6 +25,7 @@
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
@@ -78,7 +79,10 @@
vector::VectorContractLowering::OuterProduct));
vector::populateVectorMaskOpLoweringPatterns(patterns);
vector::populateVectorShapeCastLoweringPatterns(patterns);
- mlir::vector::populateVectorTransposeLoweringPatterns(patterns);
+  // TODO: doubtful that the "default" does what one wants here; it is likely
+  // better to use something else.
+ vector::populateVectorTransposeLoweringPatterns(
+ patterns, vector::VectorTransformsOptions());
vector::populateVectorTransferLoweringPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns)))) {
return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
index edfff5e..461cb2a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
@@ -16,6 +16,7 @@
#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"
#include "mlir/Dialect/NVGPU/Utils/MMAUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
index 9800cd8..615f3c6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
@@ -8,6 +8,7 @@
#include "iree/compiler/Codegen/Passes.h"
#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index f26fc59..295c97d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -28,14 +28,9 @@
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Region.h"
-#include "mlir/IR/Visitors.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
using llvm::dbgs;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
index 24ffbf1..1883352 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
@@ -42,20 +42,20 @@
// CHECK: %[[D5:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32>
// CHECK: memref.assume_alignment %[[D5]], 64 : memref<4096x4096xf32>
// CHECK: gpu.barrier
-// CHECK: %[[D6:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-// CHECK: %[[D7:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: %[[D8:.*]] = vector.transfer_read %[[D4]]{{\[}}%[[D6]], %[[D7]]], %[[CST]] {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<1x4xf32>
-// CHECK: %[[D9:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-// CHECK: %[[D10:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: vector.transfer_write %[[D8]], %[[D3]]{{\[}}%[[D9]], %[[D10]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[D6:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: %[[D8:.*]] = vector.transfer_read %[[D4]][%[[D6]], %[[D7]]], %[[CST]] {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<1x4xf32>
+// CHECK: %[[D9:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: vector.transfer_write %[[D8]], %[[D3]][%[[D9]], %[[D10]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
// CHECK: gpu.barrier
-// CHECK: %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: %[[D12:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[D11]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: %[[D12:.*]] = vector.transfer_read %[[D3]][%[[D11]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
// CHECK: %[[D13:.*]] = vector.shape_cast %[[D12]] : vector<4x1xf32> to vector<1x4xf32>
// CHECK: %[[D14:.*]] = vector.extract %[[D13]][0] : vector<1x4xf32>
-// CHECK: %[[D15:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-// CHECK: %[[D16:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: vector.transfer_write %[[D14]], %[[D5]]{{\[}}%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<4096x4096xf32>
+// CHECK: %[[D15:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+// CHECK: %[[D16:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: vector.transfer_write %[[D14]], %[[D5]][%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<4096x4096xf32>
// -----
@@ -107,22 +107,22 @@
// CHECK: %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<768x2048xf32>
// CHECK: memref.assume_alignment %[[D6]], 64 : memref<768x2048xf32>
// CHECK: gpu.barrier
-// CHECK: %[[D7:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-// CHECK: %[[D8:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: %[[D9:.*]] = vector.transfer_read %[[D4]]{{\[}}%[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true]} : memref<2048x768xf32>, vector<1x4xf32>
-// CHECK: %[[D10:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-// CHECK: %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: vector.transfer_write %[[D9]], %[[D3]]{{\[}}%[[D10]], %[[D11]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: %[[D9:.*]] = vector.transfer_read %[[D4]][%[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true]} : memref<2048x768xf32>, vector<1x4xf32>
+// CHECK: %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: vector.transfer_write %[[D9]], %[[D3]][%[[D10]], %[[D11]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
// CHECK: gpu.barrier
-// CHECK: %[[D12:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: %[[D13:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+// CHECK: %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: %[[D13:.*]] = vector.transfer_read %[[D3]][%[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
// CHECK: %[[D14:.*]] = vector.shape_cast %[[D13]] : vector<4x1xf32> to vector<1x4xf32>
-// CHECK: %[[D15:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-// CHECK: %[[D16:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: %[[D17:.*]] = vector.transfer_read %[[D5]]{{\[}}%[[D15]], %[[D16]]], %[[CST]] {in_bounds = [true]} : memref<768x2048xf32>, vector<4xf32>
+// CHECK: %[[D15:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+// CHECK: %[[D16:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: %[[D17:.*]] = vector.transfer_read %[[D5]][%[[D15]], %[[D16]]], %[[CST]] {in_bounds = [true]} : memref<768x2048xf32>, vector<4xf32>
// CHECK: %[[D18:.*]] = vector.extract %[[D14]][0] : vector<1x4xf32>
// CHECK: %[[D19:.*]] = arith.addf %[[D18]], %[[D17]] : vector<4xf32>
-// CHECK: vector.transfer_write %[[D19]], %[[D6]]{{\[}}%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<768x2048xf32>
+// CHECK: vector.transfer_write %[[D19]], %[[D6]][%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<768x2048xf32>
// -----
@@ -213,23 +213,23 @@
// CHECK: %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32>
// CHECK: memref.assume_alignment %[[D6]], 64 : memref<10x768x2048xf32>
// CHECK: gpu.barrier
-// CHECK: %[[D7:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-// CHECK: %[[D8:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: %[[D9:.*]] = vector.transfer_read %[[D4]]{{\[}}%{{.*}}, %[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x2048x768xf32>, vector<1x1x4xf32>
-// CHECK: %[[D10:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-// CHECK: %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: vector.transfer_write %[[D9]], %[[D3]]{{\[}}%[[C0]], %[[D10]], %[[D11]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: %[[D9:.*]] = vector.transfer_read %[[D4]][%{{.*}}, %[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x2048x768xf32>, vector<1x1x4xf32>
+// CHECK: %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: vector.transfer_write %[[D9]], %[[D3]][%[[C0]], %[[D10]], %[[D11]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
// CHECK: gpu.barrier
-// CHECK: %[[D12:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: %[[D13:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[C0]], %[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+// CHECK: %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: %[[D13:.*]] = vector.transfer_read %[[D3]][%[[C0]], %[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
// CHECK: %[[D14:.*]] = vector.broadcast %[[D13]] : vector<4x1xf32> to vector<1x4x1xf32>
// CHECK: %[[D15:.*]] = vector.shape_cast %[[D14]] : vector<1x4x1xf32> to vector<1x1x4xf32>
-// CHECK: %[[D16:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-// CHECK: %[[D17:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: %[[D18:.*]] = vector.transfer_read %[[D5]]{{\[}}%{{.*}}, %[[D16]], %[[D17]]], %[[CST]] {in_bounds = [true]} : memref<10x768x2048xf32>, vector<4xf32>
+// CHECK: %[[D16:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+// CHECK: %[[D17:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: %[[D18:.*]] = vector.transfer_read %[[D5]][%{{.*}}, %[[D16]], %[[D17]]], %[[CST]] {in_bounds = [true]} : memref<10x768x2048xf32>, vector<4xf32>
// CHECK: %[[D19:.*]] = vector.extract %[[D15]][0, 0] : vector<1x1x4xf32>
// CHECK: %[[D20:.*]] = arith.addf %[[D19]], %[[D18]] : vector<4xf32>
-// CHECK: vector.transfer_write %[[D20]], %[[D6]]{{\[}}%{{.*}}, %[[D16]], %[[D17]]] {in_bounds = [true]} : vector<4xf32>, memref<10x768x2048xf32>
+// CHECK: vector.transfer_write %[[D20]], %[[D6]][%{{.*}}, %[[D16]], %[[D17]]] {in_bounds = [true]} : vector<4xf32>, memref<10x768x2048xf32>
// -----
@@ -281,25 +281,25 @@
// CHECK: %[[D7:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32>
// CHECK: memref.assume_alignment %[[D7]], 64 : memref<10x2048x768xf32>
// CHECK: gpu.barrier
-// CHECK: %[[D8:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-// CHECK: %[[D9:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: %[[D10:.*]] = vector.transfer_read %[[D5]]{{\[}}%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
-// CHECK: %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-// CHECK: %[[D12:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: vector.transfer_write %[[D10]], %[[D4]]{{\[}}%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
-// CHECK: %[[D13:.*]] = vector.transfer_read %[[D6]]{{\[}}%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
-// CHECK: vector.transfer_write %[[D13]], %[[D3]]{{\[}}%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D9:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: %[[D10:.*]] = vector.transfer_read %[[D5]][%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
+// CHECK: %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+// CHECK: %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: vector.transfer_write %[[D10]], %[[D4]][%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[D13:.*]] = vector.transfer_read %[[D6]][%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
+// CHECK: vector.transfer_write %[[D13]], %[[D3]][%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
// CHECK: gpu.barrier
-// CHECK: %[[D14:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-// CHECK: %[[D15:.*]] = vector.transfer_read %[[D4]]{{\[}}%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
-// CHECK: %[[D16:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+// CHECK: %[[D14:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+// CHECK: %[[D15:.*]] = vector.transfer_read %[[D4]][%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+// CHECK: %[[D16:.*]] = vector.transfer_read %[[D3]][%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
// CHECK: %[[D17:.*]] = arith.addf %[[D15]], %[[D16]] : vector<4x1xf32>
// CHECK: %[[D18:.*]] = vector.broadcast %[[D17]] : vector<4x1xf32> to vector<1x4x1xf32>
// CHECK: %[[D19:.*]] = vector.shape_cast %[[D18]] : vector<1x4x1xf32> to vector<1x1x4xf32>
// CHECK: %[[D20:.*]] = vector.extract %[[D19]][0, 0] : vector<1x1x4xf32>
-// CHECK: %[[D21:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-// CHECK: %[[D22:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-// CHECK: vector.transfer_write %[[D20]], %[[D7]]{{\[}}%{{.*}}, %[[D21]], %[[D22]]] {in_bounds = [true]} : vector<4xf32>, memref<10x2048x768xf32>
+// CHECK: %[[D21:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+// CHECK: %[[D22:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+// CHECK: vector.transfer_write %[[D20]], %[[D7]][%{{.*}}, %[[D21]], %[[D22]]] {in_bounds = [true]} : vector<4xf32>, memref<10x2048x768xf32>
// -----
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
index 70497b6..4c7755a 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
@@ -37,6 +37,7 @@
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Interfaces/VectorInterfaces.h"
#include "mlir/Pass/Pass.h"
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
index 66997da..7a16383 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
@@ -27,6 +27,7 @@
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/Matchers.h"
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
index bbf82d7..42c00a2 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
@@ -64,8 +64,8 @@
outs(%17 : tensor<256x1024xf16>) {
^bb0(%arg2: f16, %arg3: f16, %arg4: f16, %arg5: f16):
%28 = arith.divf %arg2, %arg3 : f16
- // spirv.GL.Exp is not permitted to use cooperative matrix types per the spec.
- %29 = math.exp %28 : f16
+ // spirv.GL.FAbs is not permitted to use cooperative matrix types per the spec.
+ %29 = math.absf %28 : f16
linalg.yield %29 : f16
} -> tensor<256x1024xf16>
flow.dispatch.tensor.store %27, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
@@ -189,19 +189,19 @@
// CHECK: spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
// CHECK: spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK: spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
// CHECK: spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK: spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
// CHECK: spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK: spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
// CHECK: spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK: spirv.ControlBarrier <Workgroup>, <Workgroup>, <AcquireRelease|WorkgroupMemory>
// -----
@@ -577,8 +577,8 @@
outs(%17 : tensor<256x1024xf16>) {
^bb0(%arg2: f16, %arg3: f16, %arg4: f16, %arg5: f16):
%28 = arith.divf %arg2, %arg3 : f16
- // spirv.GL.Exp is not permitted to use cooperative matrix types per the spec.
- %29 = math.exp %28 : f16
+ // spirv.GL.FAbs is not permitted to use cooperative matrix types per the spec.
+ %29 = math.absf %28 : f16
linalg.yield %29 : f16
} -> tensor<256x1024xf16>
flow.dispatch.tensor.store %27, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
@@ -639,13 +639,13 @@
// CHECK: spirv.ControlBarrier <Workgroup>, <Workgroup>, <AcquireRelease|WorkgroupMemory>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK-COUNT-2: spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2: spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2: spirv.GL.FAbs %{{.+}} : vector<4xf16>
// CHECK: spirv.ControlBarrier <Workgroup>, <Workgroup>, <AcquireRelease|WorkgroupMemory>
// -----
diff --git a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp
index 3689722..9fb1220 100644
--- a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp
@@ -19,6 +19,8 @@
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
@@ -33,20 +35,56 @@
using iree_compiler::cpu::ReductionStrategy;
using iree_compiler::IREE::transform_dialect::ApplyPatternsOpPatterns;
using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp;
-using transform::LowerVectorsOp;
+using transform::ApplyTransferPermutationPatternsOp;
+using transform::LowerContractionOp;
+using transform::LowerMultiReductionOp;
+using transform::LowerShapeCastOp;
+using transform::LowerTransferOp;
+using transform::LowerTransposeOp;
using transform::MatchOp;
using transform::SplitHandlesOp;
+using transform::SplitTransferFullPartialOp;
+using transform::TransferToScfOp;
using transform_ext::AllDims;
using transform_ext::m_StructuredOp;
using transform_ext::NumEqualsTo;
using transform_ext::RegisterMatchCallbacksOp;
using transform_ext::ShapeKind;
using transform_ext::StructuredOpMatcher;
+using vector::VectorContractLoweringAttr;
//===----------------------------------------------------------------------===//
// Mid-level problem-specific strategy builder APIs, follow MLIR-style builders.
//===----------------------------------------------------------------------===//
+// TODO: better builders.
+static Value buildDefaultVectorLoweringStrategy(
+ ImplicitLocOpBuilder &b, Value funcH,
+ const vector::LowerVectorsOptions &lowerVectorsOpts) {
+ auto pdlOperation = pdl::OperationType::get(b.getContext());
+ funcH = b.create<LowerContractionOp>(
+ pdlOperation, funcH,
+ /*loweringStrategy=*/lowerVectorsOpts.vectorContractLowering);
+ funcH = b.create<ApplyTransferPermutationPatternsOp>(pdlOperation, funcH);
+ funcH = b.create<LowerMultiReductionOp>(
+ pdlOperation, funcH,
+ /*loweringStrategy=*/lowerVectorsOpts.vectorMultiReductionLowering);
+ funcH = b.create<SplitTransferFullPartialOp>(
+ pdlOperation, funcH,
+ /*splitTransferStrategy=*/lowerVectorsOpts.vectorTransferSplit);
+ funcH = b.create<TransferToScfOp>(
+ pdlOperation, funcH,
+ /*maxTransferRank=*/1,
+ /*fullUnroll=*/lowerVectorsOpts.unrollVectorTransfers);
+ funcH = b.create<LowerTransferOp>(pdlOperation, funcH, /*maxTransferRank=*/1);
+ funcH = b.create<LowerShapeCastOp>(pdlOperation, funcH);
+ funcH = b.create<LowerTransposeOp>(
+ pdlOperation, funcH,
+ /*loweringStrategy=*/lowerVectorsOpts.vectorTransposeLowering,
+ /*avx2LoweringStrategy=*/lowerVectorsOpts.transposeAVX2Lowering);
+ return funcH;
+}
+
/// Take care of the last common steps in a CPU strategy (i.e. vectorize,
/// bufferize and map to blocks).
/// Return the handles to the updated variant and the func::FuncOp ops under
@@ -81,9 +119,7 @@
b.create<ForallToWorkgroupOp>(funcH);
// Step N. Lower vectors.
- // TODO: Control the lowering to vectors.
- auto pdlOperation = pdl::OperationType::get(b.getContext());
- funcH = b.create<LowerVectorsOp>(pdlOperation, funcH, lowerVectorsOpts);
+ funcH = buildDefaultVectorLoweringStrategy(b, funcH, lowerVectorsOpts);
return std::make_pair(variantH, funcH);
}
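
Note: upstream replaced the monolithic transform.vector.lower_vectors op with the finer-grained ops used above, so each strategy can now pick exactly the lowering steps it needs. Purely as an illustration (not part of this change), a reduced strategy that only lowers contractions and shape casts could reuse the same builders; this hypothetical helper assumes the same includes and using-declarations as Common.cpp:

  // Hypothetical helper, for illustration only: a pared-down variant of
  // buildDefaultVectorLoweringStrategy that keeps just two of the steps.
  static Value buildMinimalVectorLoweringStrategy(ImplicitLocOpBuilder &b,
                                                  Value funcH) {
    auto pdlOperation = pdl::OperationType::get(b.getContext());
    // Lower vector.contract ops via outer products.
    funcH = b.create<LowerContractionOp>(
        pdlOperation, funcH,
        /*loweringStrategy=*/vector::VectorContractLowering::OuterProduct);
    // Lower vector.shape_cast ops; this op takes no extra configuration.
    funcH = b.create<LowerShapeCastOp>(pdlOperation, funcH);
    return funcH;
  }
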
diff --git a/integrations/tensorflow/WORKSPACE b/integrations/tensorflow/WORKSPACE
index f97b607..6dd236e 100644
--- a/integrations/tensorflow/WORKSPACE
+++ b/integrations/tensorflow/WORKSPACE
@@ -7,7 +7,7 @@
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
-TENSORFLOW_COMMIT = "9104c104fe2422b1034be276f94f9b53b61751eb"
+TENSORFLOW_COMMIT = "0a8fd11e73559d9bf0e547343f08f648981e2ff7"
git_repository(
name = "org_tensorflow",
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
index c348284..33a8682 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
@@ -22,6 +22,7 @@
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/Passes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/PassManager.h"
@@ -259,10 +260,10 @@
vector::populateVectorToVectorCanonicalizationPatterns(patterns);
// In a progressive lowering of vectors, this would be the 1st step.
if (options.contractionLowering) {
- patterns.add<vector::ContractionOpToOuterProductOpLowering,
- vector::ContractionOpToMatmulOpLowering,
- vector::ContractionOpLowering>(
- options.vectorTransformOptions, context);
+ vector::populateVectorContractLoweringPatterns(
+ patterns, options.vectorTransformOptions,
+ /*benefit=*/1,
+ /*disableOuterProductLowering=*/true);
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
}
// In a progressive lowering of vectors, this would be the 2nd step.
@@ -273,8 +274,8 @@
}
// In a progressive lowering of vectors, this would be the 3rd step.
if (options.transferPartialRewrite) {
- patterns.add<vector::VectorTransferFullPartialRewriter>(
- context, options.vectorTransformOptions);
+ populateVectorTransferFullPartialPatterns(patterns,
+ options.vectorTransformOptions);
}
// In a progressive lowering of vectors, this would be the 4th step.
if (options.transferLowering) {
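
For context, the populate-based form above registers the same rewrites through upstream helpers instead of naming individual pattern classes; the resulting pattern set is then driven greedily as before. A rough sketch of that flow is below; applyPatternsAndFoldGreedily is the stock MLIR driver and nothing in this sketch is added by this commit:

  // Illustrative only; assumes `using namespace mlir;` as in Transforms.cpp.
  #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

  static LogicalResult lowerContractionsGreedily(
      Operation *root, vector::VectorTransformsOptions opts) {
    RewritePatternSet patterns(root->getContext());
    // Same upstream helpers this hunk switches to.
    vector::populateVectorContractLoweringPatterns(
        patterns, opts, /*benefit=*/1,
        /*disableOuterProductLowering=*/true);
    vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
    // Apply the rewrites to a fixed point.
    return applyPatternsAndFoldGreedily(root, std::move(patterns));
  }
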
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
index 3926f9c..17a3e45 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
@@ -20,9 +20,11 @@
// CHECK: transform.structured.vectorize %[[OPS2]]
transform.structured.vectorize %5
// CHECK: %[[FUNC:.*]] = transform.structured.match ops{["func.func"]} in %arg0
- // CHECK: lower_vectors %[[FUNC]] {{.*}} multireduction_lowering = innerreduction
+ // CHECK: vector.lower_contraction %[[FUNC]] {{.*}}
%6 = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
- transform.vector.lower_vectors %6 multireduction_lowering = "innerreduction"
+ transform.vector.lower_contraction %6
+ lowering_strategy = "outerproduct"
+ : (!pdl.operation) -> !pdl.operation
// CHECK: lower_to_llvm
lower_to_llvm
}
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
index f5b7a52..888a4c0 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
@@ -25,9 +25,16 @@
{bufferize_function_boundaries = true}
%3 = transform.structured.match ops{["func.func"]} in %module_op
: (!pdl.operation) -> !pdl.operation
- transform.vector.lower_vectors %3 multireduction_lowering = "innerreduction"
- // lower_to_llvm is the only remaining op not upstreamed, at the same time we
- // upstreamed --test-lower-to-llvm.
+
+ %func = transform.structured.match ops{["func.func"]} in %module_op
+ : (!pdl.operation) -> !pdl.operation
+ %func_e_2 = transform.vector.lower_contraction %func
+ lowering_strategy = "outerproduct"
+ : (!pdl.operation) -> !pdl.operation
+ %func_e_3 = transform.vector.lower_transpose %func_e_2
+ lowering_strategy = "shuffle"
+ : (!pdl.operation) -> !pdl.operation
+
lower_to_llvm
}
diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
index 6e08a3f..139417c 100644
--- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
@@ -64,6 +64,7 @@
: (!pdl.operation) -> (!pdl.operation)
transform.structured.pack_greedily %matmul
- gemm_packed_sizes = [8, 16, 32] gemm_inner_dims_order = [0, 1, 2]
+ matmul_packed_sizes = [8, 16, 32]
+ matmul_inner_dims_order = [0, 1, 2]
: (!pdl.operation) -> !transform.op<"linalg.generic">
}
diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir
index 828f2c6..8848abe 100644
--- a/tests/transform_dialect/cpu/contraction-packing.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing.mlir
@@ -146,6 +146,6 @@
// gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
// dimensions.
transform.structured.pack_greedily %matmul
- gemm_packed_sizes = [8, 16, 32] gemm_inner_dims_order = [0, 1, 2]
+ matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
: (!pdl.operation) -> !transform.op<"linalg.generic">
}
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 18f5bc5..8759c3b 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 18f5bc58159389fb1aaaa50577be4ff5397a65ae
+Subproject commit 8759c3bea5defd7c9c382904c1809ed0aa800eef
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index ab58786..29e975a 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit ab587867e4aa4307c4b573d7926c3f869563eaa2
+Subproject commit 29e975a3c0a469c41fc564c36766217aef9d07a4