Move Codegen pass pipelines to nest on `FunctionOpInterface`. (#16665)
This PR modifies the codegen backends to have the lowering pass
pipelines nest on `FunctionOpInterface`. This allows running different
pass pipelines on different functions within a dispatch, making it
possible to have something like
```
func.func @foo_pipeline(...) {
}
func.func @bar_pipeline(...) {
}
func.func @entry_point() {
if (<condition for foo pipeline based lowering>) {
foo_pipeline()
} else {
bar_pipeline()
}
}
```
To connect everything, the following changes are made:
1) The `iree_codegen.translation_info` attribute that was previously set
   on entry point operations is now set on the surrounding function. This
   allows implementing a lowering strategy per function.
2) The GPU backends used to set the `workgroup_size` and `subgroup_size`
   on the `hal.executable.export` operation. To unwind this, the
   `translation_info` attribute now has fields for `workgroup_size` and
   `subgroup_size`, allowing the GPU backends to set the expected
   `workgroup_size` and `subgroup_size` on the `translation_info` itself
   (which now lives on the surrounding function). A condensed sketch of
   this move follows the list.
3) A new pass, `ReconcileTranslationInfo`, is added after the lowering
   strategies run. The intent of this pass is to take the
   `translation_info` on each function and set the values for
   `workgroup_size` and `subgroup_size` on the
   `hal.executable.export` operation. Eventually this would also be the
   place where the number of workgroups is populated on the
   `hal.executable.export` (instead of doing it in
   `TileAndDistributeToWorkgroups` as is done today).
4) All the backend `*SelectLoweringStrategy` passes work as Module
   passes. These need to be Module passes since the transform dialect
   tends to inject the transform script into the module.
5) The `*LowerExecutableStrategy` passes now run on `FunctionOpInterface`.
6) The transform dialect interpreter has to run at `Module`
   granularity, so a new pass, `LowerExecutableUsingTransformDialect`,
   is added. It runs the transform interpreter before
   `*SelectLoweringStrategy`. After this pass runs, the
   `translation_info` is expected to have its pipeline set to
   `None` so that subsequent lowering pipelines are skipped.
7) Most tests are updated to remove the boilerplate surrounding
   `hal.executable` and `hal.executable.variant`.
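For illustration, here is a condensed sketch of the attribute and test-invocation changes, adapted from the `gpu_distribute.mlir` test update in the diff below (the `#pipeline_layout` alias, the export's workgroup-count region, and the function body are elided placeholders):
```
// Old RUN line, nesting through the hal.executable boilerplate:
//   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-gpu-distribute)))))"
// New RUN line, nesting directly on the function:
//   --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute))"

// Before: the export op carried translation_info and workgroup_size.
hal.executable.export public @add_tensor ordinal(0) layout(#pipeline_layout)
    attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>,
                workgroup_size = [64 : index, 1 : index, 1 : index]}

// After: translation_info (now also carrying the workgroup size) sits on the
// function inside the dispatch.
#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
func.func @add_tensor() attributes {translation_info = #translation} {
  // ...
}
```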
This does most of the heavy lifting for running lowering strategies
per function-like op. The biggest missing pieces are:
1) The `TileAndDistributeToWorkgroups` pass still cannot really be run
   on a dispatch with multiple functions since it updates the
   `hal.executable.export` operation. To address this, the pass will
   have to move to using `scf.forall`.
2) Some optimizations expect a static workgroup count. Those currently
   go up to the `hal.executable.export` op to get these values (which
   were populated by `TileAndDistributeToWorkgroups`). After moving to
   `scf.forall`, this information will be available within the function.
ci-extra: build_test_all_arm64, build_test_all_windows,
build_test_all_macos_arm64, build_test_all_macos_x86_64,
test_nvidia_a100
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
index 0cb90a5..0494b3a 100644
--- a/compiler/plugins/target/ROCM/ROCMTarget.cpp
+++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -305,7 +305,7 @@
llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOpMap;
std::vector<std::array<int32_t, 3>> workgroupSizes;
SmallVector<uint32_t> workgroupLocalMemories;
- int32_t subgroupSize = 64;
+ uint32_t subgroupSize = 64;
for (auto exportOp : variantOp.getExportOps()) {
exportOps.push_back(exportOp);
exportOpMap[exportOp.getSymName()] = exportOp;
@@ -320,12 +320,12 @@
}
workgroupSizes.push_back(workgroupSize);
- if (auto setSubgroupSize = getSubgroupSize(exportOp)) {
- if (subgroupSize != 32 && subgroupSize != 64) {
+ if (auto setSubgroupSize = exportOp.getSubgroupSizeAsUInt()) {
+ if (setSubgroupSize.value() != 32 && setSubgroupSize.value() != 64) {
return variantOp.emitError()
- << "invalid subgroup size " << subgroupSize;
+ << "invalid subgroup size " << setSubgroupSize.value();
}
- subgroupSize = *setSubgroupSize;
+ subgroupSize = setSubgroupSize.value();
}
uint32_t workgroupLocalMemory = 0;
@@ -385,13 +385,8 @@
// cases where codegen decides to override the value.
// Otherwise, fallback to the default option.
int64_t wavesPerEu = 0;
- IREE::Codegen::TranslationInfoAttr translationInfo =
- getTranslationInfo(exportOp);
- if (auto translationConfig = translationInfo.getConfiguration()) {
- if (auto attr =
- translationConfig.getAs<IntegerAttr>("waves_per_eu")) {
- wavesPerEu = attr.getValue().getSExtValue();
- }
+ if (auto attr = func->getAttrOfType<IntegerAttr>("waves_per_eu")) {
+ wavesPerEu = attr.getValue().getSExtValue();
}
if (wavesPerEu == 0) {
if (auto attr = getConfigIntegerAttr(targetAttr, "waves_per_eu"))
diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
index 8d651b6..0a7c4ba 100644
--- a/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -100,6 +100,7 @@
"IREEComprehensiveBufferizePass.cpp",
"IREEExpandStridedMetadata.cpp",
"InstrumentMemoryAccesses.cpp",
+ "LowerExecutableUsingTransformDialect.cpp",
"LowerUKernelsToCalls.cpp",
"MaterializeEncodingIntoNop.cpp",
"MaterializeEncodingIntoPackUnPack.cpp",
@@ -107,8 +108,10 @@
"OptimizeTensorInsertExtractSlices.cpp",
"OptimizeVectorTransferPass.cpp",
"PadDynamicAlloc.cpp",
+ "PassUtils.cpp",
"Passes.cpp",
"PolynomialApproximationPass.cpp",
+ "ReconcileTranslationInfo.cpp",
"RematerializeParallelOps.cpp",
"RemoveTrivialLoops.cpp",
"ReplaceSlowMinMaxOps.cpp",
@@ -127,6 +130,7 @@
"BufferizationAnalysis.h",
"EncodingUtils.h",
"ExtractAddressComputation.h",
+ "PassUtils.h",
"Passes.h",
"TileSizeSelection.h",
"Transforms.h",
@@ -149,6 +153,7 @@
"//compiler/src/iree/compiler/Dialect/LinalgExt/Transforms",
"//compiler/src/iree/compiler/Dialect/Util/IR",
"//compiler/src/iree/compiler/Utils",
+ "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect",
"//llvm-external-projects/iree-dialects:IREEVectorExtDialect",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:AffineAnalysis",
diff --git a/compiler/src/iree/compiler/Codegen/Common/BufferizeCopyOnlyDispatchesPass.cpp b/compiler/src/iree/compiler/Codegen/Common/BufferizeCopyOnlyDispatchesPass.cpp
index 86a8c23..f7dc6b5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/BufferizeCopyOnlyDispatchesPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/BufferizeCopyOnlyDispatchesPass.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "iree/compiler/Codegen/Common/PassDetail.h"
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/IR/FlowDialect.h"
@@ -50,45 +51,33 @@
} // namespace
void BufferizeCopyOnlyDispatchesPass::runOnOperation() {
- ModuleOp module = getOperation();
+ auto funcOp = getOperation();
- SmallVector<Operation *> copyOnlyFunctions;
- auto funcOps = module.getOps<mlir::FunctionOpInterface>();
- for (auto funcOp : funcOps) {
- /// Check if the dispatch has all sources for `flow.dispatch.tensor.store`
- /// operations coming from `flow.dispatch.tensor.load` operations. If so,
- /// this dispatch is just a copy dispatch.
- bool hasFlowDispatchStore = false;
- auto walkResult = funcOp.walk(
- [&](IREE::Flow::DispatchTensorStoreOp storeOp) -> WalkResult {
- hasFlowDispatchStore = true;
- return success(isReadOnly(storeOp.getValue()));
- });
- if (walkResult.wasInterrupted())
- continue;
- // The function is just a copy and is not yet bufferized.
- if (hasFlowDispatchStore)
- copyOnlyFunctions.push_back(funcOp);
- }
-
- // There are no copy-only functions. So nothing to do.
- if (copyOnlyFunctions.empty())
+ /// Check if the dispatch has all sources for `flow.dispatch.tensor.store`
+ /// operations coming from `flow.dispatch.tensor.load` operations. If so,
+ /// this dispatch is just a copy dispatch.
+ bool hasFlowDispatchStore = false;
+ auto walkResult =
+ funcOp.walk([&](IREE::Flow::DispatchTensorStoreOp storeOp) -> WalkResult {
+ hasFlowDispatchStore = true;
+ return success(isReadOnly(storeOp.getValue()));
+ });
+ if (walkResult.wasInterrupted())
+ return;
+ // The function is just a copy and is not yet bufferized.
+ if (!hasFlowDispatchStore)
return;
- // Bufferize the dispatch to create a `linalg.generic` as a copy operation.
- // This can then be used by the backends to tile and distribute.
- // Currently bufferization does not handle single function bufferization. So
- // check that all functions are copy only and can be bufferized.
- if (copyOnlyFunctions.size() !=
- std::distance(funcOps.begin(), funcOps.end())) {
- module.emitOpError(
- "module contains functions that are both copy only and not copy only. "
- "This is currently unhandled.");
+ // Apply the bufferization passes.
+ std::optional<OpPassManager> maybeBufferizationPipeline =
+ getFunctionOpInterfacePassManager(funcOp);
+ if (!maybeBufferizationPipeline) {
+ funcOp.emitOpError("unhandled operation type while creating pass pipeline "
+ "nested on `FunctionOpInterface`");
return signalPassFailure();
}
+ OpPassManager &bufferizationPipeline = maybeBufferizationPipeline.value();
- // Apply the bufferization passes.
- OpPassManager bufferizationPipeline(module.getOperationName());
// The copy-only dispatch shouldnt need an allocation. Error out on
// allocation.
bufferization::BufferizationOptions::AllocationFn allocationFn =
@@ -106,21 +95,21 @@
addIREEComprehensiveBufferizePasses(bufferizationPipeline, allocationFn,
memcpyFn);
- if (failed(runPipeline(bufferizationPipeline, module))) {
+ if (failed(runPipeline(bufferizationPipeline, funcOp))) {
return signalPassFailure();
}
// Check that there are no allocs created.
- auto hasAlloc = module.walk(
+ auto hasAlloc = funcOp.walk(
[&](memref::AllocOp /*op*/) -> WalkResult { return failure(); });
if (hasAlloc.wasInterrupted()) {
- module.emitOpError(
+ funcOp.emitOpError(
"unexpected allocations while bufferizing copy dispatch");
return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<ModuleOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createBufferizeCopyOnlyDispatchesPass() {
return std::make_unique<BufferizeCopyOnlyDispatchesPass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
index c91f42b..9a7cc2b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -55,6 +55,7 @@
"BufferizationAnalysis.h"
"EncodingUtils.h"
"ExtractAddressComputation.h"
+ "PassUtils.h"
"Passes.h"
"TileSizeSelection.h"
"Transforms.h"
@@ -93,6 +94,7 @@
"IREEComprehensiveBufferizePass.cpp"
"IREEExpandStridedMetadata.cpp"
"InstrumentMemoryAccesses.cpp"
+ "LowerExecutableUsingTransformDialect.cpp"
"LowerUKernelsToCalls.cpp"
"MaterializeEncodingIntoNop.cpp"
"MaterializeEncodingIntoPackUnPack.cpp"
@@ -100,8 +102,10 @@
"OptimizeTensorInsertExtractSlices.cpp"
"OptimizeVectorTransferPass.cpp"
"PadDynamicAlloc.cpp"
+ "PassUtils.cpp"
"Passes.cpp"
"PolynomialApproximationPass.cpp"
+ "ReconcileTranslationInfo.cpp"
"RematerializeParallelOps.cpp"
"RemoveTrivialLoops.cpp"
"ReplaceSlowMinMaxOps.cpp"
@@ -118,6 +122,7 @@
DEPS
::PassHeaders
::PassesIncGen
+ IREELinalgTransformDialect
IREEVectorExtDialect
LLVMSupport
MLIRAffineAnalysis
diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.cpp
index e47f88a..bd4e097 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.cpp
@@ -44,10 +44,10 @@
return success();
}
-void addCPUBufferizePasses(OpPassManager &passManager) {
+void addCPUBufferizePasses(OpPassManager &funcPassManager) {
BufferizationOptions::AllocationFn allocationFn = cpuAllocationFn;
BufferizationOptions::MemCpyFn memcpyFn = cpuCopyFn;
- addIREEComprehensiveBufferizePasses(passManager, allocationFn, memcpyFn);
+ addIREEComprehensiveBufferizePasses(funcPassManager, allocationFn, memcpyFn);
}
//===---------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.h b/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.h
index f1e1975..8460000 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/CPU/Passes.h
@@ -46,7 +46,7 @@
ArrayRef<IREE::HAL::ExecutableTargetAttr> targetAttrs = {});
/// Adds CPU bufferization passes to the pipeline.
-void addCPUBufferizePasses(OpPassManager &passManager);
+void addCPUBufferizePasses(OpPassManager &funcPassManager);
/// Pass to lower a sequence of operations to a iree_codegen.ukernel.*
/// operation.
diff --git a/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ArithToF32.cpp b/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ArithToF32.cpp
index 43de5b2..b4f43bb 100644
--- a/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ArithToF32.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ArithToF32.cpp
@@ -313,8 +313,7 @@
} // namespace
-std::unique_ptr<OperationPass<mlir::ModuleOp>>
-createConvertBf16ArithToF32Pass() {
+std::unique_ptr<OperationPass<>> createConvertBf16ArithToF32Pass() {
return std::make_unique<ConvertBf16ArithToF32Pass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ToUInt16Buffers.cpp b/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ToUInt16Buffers.cpp
index c38c5cd..1c56bd9 100644
--- a/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ToUInt16Buffers.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/ConvertBf16ToUInt16Buffers.cpp
@@ -250,7 +250,7 @@
}
void runOnOperation() override {
- ModuleOp op = getOperation();
+ auto op = getOperation();
MLIRContext *ctx = &getContext();
Bf16EmulationConverter typeConverter;
@@ -311,8 +311,7 @@
// Public interface
//===----------------------------------------------------------------------===//
-std::unique_ptr<OperationPass<ModuleOp>>
-createConvertBf16ToUInt16BuffersPass() {
+std::unique_ptr<OperationPass<>> createConvertBf16ToUInt16BuffersPass() {
return std::make_unique<ConvertBf16ToUInt16BuffersPass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp b/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp
index 3e4d4f4..229bcda 100644
--- a/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp
@@ -165,7 +165,7 @@
// Public interface
//===----------------------------------------------------------------------===//
-std::unique_ptr<OperationPass<ModuleOp>> createEmulateNarrowTypePass() {
+std::unique_ptr<OperationPass<>> createEmulateNarrowTypePass() {
return std::make_unique<EmulateNarrowTypePass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUCheckResourceUsage.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUCheckResourceUsage.cpp
index 8b02f7b..d0d0c8c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUCheckResourceUsage.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUCheckResourceUsage.cpp
@@ -104,21 +104,20 @@
}
void GPUCheckResourceUsagePass::runOnOperation() {
- auto moduleOp = getOperation();
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- unsigned limit = this->getSharedMemoryLimit
- ? this->getSharedMemoryLimit(funcOp)
- : 64 * 1024;
- if (failed(checkGPUAllocationSize(funcOp, limit,
- this->getIndexBitwidth
- ? this->getIndexBitwidth
- : getDatalayoutIndexBitwidth))) {
- return signalPassFailure();
- }
+ auto funcOp = getOperation();
+ unsigned limit = this->getSharedMemoryLimit
+ ? this->getSharedMemoryLimit(funcOp)
+ : 64 * 1024;
+ if (failed(checkGPUAllocationSize(funcOp, limit,
+ this->getIndexBitwidth
+ ? this->getIndexBitwidth
+ : getDatalayoutIndexBitwidth))) {
+ return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<ModuleOp>> createGPUCheckResourceUsagePass(
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createGPUCheckResourceUsagePass(
std::function<unsigned(mlir::FunctionOpInterface)> getSharedMemoryLimit,
std::function<unsigned(mlir::FunctionOpInterface)> getIndexBitwidth) {
return std::make_unique<GPUCheckResourceUsagePass>(getSharedMemoryLimit,
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp
index 0e0be74..45a3550 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp
@@ -6,6 +6,7 @@
#include "iree/compiler/Codegen/Common/GPU/PassDetail.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
@@ -25,25 +26,24 @@
}
void runOnOperation() override {
auto funcOp = getOperation();
- if (!isEntryPoint(funcOp))
- return;
- auto workgroupSize = llvm::map_to_vector(
- getEntryPoint(funcOp)->getWorkgroupSize().value(),
- [&](Attribute attr) { return llvm::cast<IntegerAttr>(attr).getInt(); });
+ std::optional<SmallVector<int64_t>> workgroupSize =
+ getWorkgroupSize(funcOp);
+ if (!workgroupSize) {
+ return;
+ }
// TODO: Thread through subgroup size everywhere.
- std::optional<llvm::APInt> maybeSubgroupSize =
- getEntryPoint(funcOp)->getSubgroupSize();
+ std::optional<int64_t> maybeSubgroupSize = getSubgroupSize(funcOp);
// TODO: Don't hard code kCudaWarpSize here.
- int64_t subgroupSize =
- maybeSubgroupSize ? maybeSubgroupSize->getSExtValue() : kCudaWarpSize;
+ int64_t subgroupSize = maybeSubgroupSize.value_or(kCudaWarpSize);
IRRewriter rewriter(funcOp->getContext());
rewriter.setInsertionPointToStart(&funcOp.front());
DiagnosedSilenceableFailure result =
mlir::transform::gpu::mapNestedForallToThreadsImpl(
- rewriter, std::nullopt, funcOp, workgroupSize, subgroupSize, false);
+ rewriter, std::nullopt, funcOp, workgroupSize.value(), subgroupSize,
+ false);
if (!result.succeeded())
return signalPassFailure();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp
index 729865a..f4a7326 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeSharedMemoryCopy.cpp
@@ -366,14 +366,12 @@
}
LogicalResult gpuDistributeSharedMemoryCopy(mlir::FunctionOpInterface funcOp) {
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
- if (failed(exportOp)) {
- // We cannot do anything because we do not have the workgroup size
- // information, but the pass did not fail.
- return success();
+ auto maybeWorkgroupSize = getWorkgroupSize(funcOp);
+ if (!maybeWorkgroupSize) {
+ return funcOp.emitOpError("failed to distribute shared memory copy since "
+ "workgroup size isnt set");
}
-
- auto workgroupSize = getWorkgroupSize(exportOp.value());
+ SmallVector<int64_t> workgroupSize = maybeWorkgroupSize.value();
workgroupSize.resize(3, 1);
MLIRContext *context = funcOp.getContext();
SmallVector<linalg::GenericOp> copiesToWorkgroupMem;
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
index 9d5a49d..1887c28 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp
@@ -8,7 +8,7 @@
#include "iree-dialects/Dialect/VectorExt/IR/VectorExtOps.h"
#include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
#include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h"
-#include "iree/compiler/Codegen/Common/VectorLayoutAnalysis.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Utils.h"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp
index 9a38cbc..abddf1e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp
@@ -332,13 +332,14 @@
}
void runOnOperation() override {
auto funcOp = getOperation();
- if (!isEntryPoint(funcOp))
- return;
- auto workgroupSize = llvm::map_to_vector(
- getEntryPoint(funcOp)->getWorkgroupSize().value(),
- [&](Attribute attr) { return llvm::cast<IntegerAttr>(attr).getInt(); });
- if (failed(tileParallelDims(funcOp, workgroupSize, distributeToWarp))) {
+ std::optional<SmallVector<int64_t>> workgroupSize =
+ getWorkgroupSize(funcOp);
+ if (!workgroupSize) {
+ return;
+ }
+ if (failed(tileParallelDims(funcOp, workgroupSize.value(),
+ distributeToWarp))) {
return signalPassFailure();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
index 6d3fb1f..eabe4b4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
@@ -82,7 +82,8 @@
// This size is used to check the allocation space required for memrefs of
// indices. If this function is nullptr, this pass will query the datalayout to
// get the index size.
-std::unique_ptr<OperationPass<ModuleOp>> createGPUCheckResourceUsagePass(
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createGPUCheckResourceUsagePass(
std::function<unsigned(mlir::FunctionOpInterface)> getSharedMemoryLimit =
nullptr,
std::function<unsigned(mlir::FunctionOpInterface)> getIndexBitwidth =
@@ -145,6 +146,8 @@
// `getWarpSize` is for deciding the warp size to use; it takes the
// current function containing those vector ops as the argument.
// If nullptr, warp size 32 will be used.
+// TODO: This kind of call back function is a really really bad idea
+// This should be easier to resolve than doing this.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createConvertVectorReductionToGPUPass(
bool expandSubgroupReduction = true,
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
index c76325a..3bffe60 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -14,7 +14,7 @@
//===---------------------------------------------------------------------===//
def GPUCheckResourceUsage :
- Pass<"iree-codegen-gpu-check-resource-usage", "ModuleOp"> {
+ InterfacePass<"iree-codegen-gpu-check-resource-usage", "mlir::FunctionOpInterface"> {
let summary = "Checks GPU specific resource usage constraints like shared memory limits";
let constructor = "mlir::iree_compiler::createGPUCheckResourceUsagePass()";
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
index 69799e2..55e6bec 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
@@ -6,6 +6,7 @@
#include "iree/compiler/Codegen/Common/GPU/PassDetail.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -225,9 +226,14 @@
debugPrint(funcOp, "after step #1: preprocessing reduction ops");
- auto workgroupSize = llvm::map_to_vector(
- getEntryPoint(funcOp)->getWorkgroupSize().value(),
- [&](Attribute attr) { return llvm::cast<IntegerAttr>(attr).getInt(); });
+ std::optional<SmallVector<int64_t>> maybeWorkgroupSize =
+ getWorkgroupSize(funcOp);
+ if (!maybeWorkgroupSize) {
+ funcOp->emitOpError(
+ "expected workgroup size to be set as part of `translation_info`");
+ return signalPassFailure();
+ }
+ SmallVector<int64_t> &workgroupSize = maybeWorkgroupSize.value();
assert(workgroupSize[1] == 1 && workgroupSize[2] == 1);
// 2. Create the warp op and move the function body into it.
const int groupSize = workgroupSize[0];
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_check_resource_usage.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_check_resource_usage.mlir
index 7c14175..711aebb 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_check_resource_usage.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_check_resource_usage.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-codegen-gpu-check-resource-usage %s --verify-diagnostics -split-input-file | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-check-resource-usage))" %s --verify-diagnostics -split-input-file | FileCheck %s
module {
// expected-error @+1 {{uses 274432 bytes of shared memory; exceeded the limit of 65536 bytes}}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir
index 9c474e1..b99bfa9 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir
@@ -1,45 +1,37 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-gpu-distribute, cse)))))" %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute, cse))" %s | FileCheck %s
-hal.executable private @add_tensor {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @add_tensor ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>,
- workgroup_size = [64 : index, 1 : index, 1 : index]} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#map = affine_map<()[s0] -> (s0 * 256)>
+#map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
+#map2 = affine_map<(d0) -> (d0 * 4)>
+#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+module {
+ func.func @add_tensor() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c64 = arith.constant 64 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
+ memref.assume_alignment %0, 64 : memref<233x1024xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
+ memref.assume_alignment %1, 64 : memref<233x1024xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
+ memref.assume_alignment %2, 64 : memref<233x1024xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %3 = affine.apply #map()[%workgroup_id_x]
+ %subview = memref.subview %2[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+ %subview_0 = memref.subview %0[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+ %subview_1 = memref.subview %1[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+ scf.forall (%arg0) in (%c64) {
+ %4 = affine.apply #map2(%arg0)
+ %subview_2 = memref.subview %subview[0, %4] [1, 4] [1, 1] : memref<1x256xf32, #map1> to memref<1x4xf32, #map1>
+ %5 = vector.transfer_read %subview_0[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+ %6 = vector.transfer_read %subview_1[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+ %7 = arith.addf %5, %6 : vector<4xf32>
+ vector.transfer_write %7, %subview_2[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #map1>
+ } {mapping = [#gpu.thread<x>]}
+ return
}
- builtin.module {
- func.func @add_tensor() {
- %cst = arith.constant 0.000000e+00 : f32
- %c64 = arith.constant 64 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
- memref.assume_alignment %0, 64 : memref<233x1024xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
- memref.assume_alignment %1, 64 : memref<233x1024xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
- memref.assume_alignment %2, 64 : memref<233x1024xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x]
- %4 = memref.subview %2[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- %5 = memref.subview %0[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- %6 = memref.subview %1[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- scf.forall (%arg0) in (%c64) shared_outs() -> () {
- %7 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg0)
- %8 = memref.subview %4[0, %7] [1, 4] [1, 1] : memref<1x256xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>> to memref<1x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- %9 = vector.transfer_read %5[%c0, %7], %cst {in_bounds = [true]} : memref<1x256xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
- %10 = vector.transfer_read %6[%c0, %7], %cst {in_bounds = [true]} : memref<1x256xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, vector<4xf32>
- %11 = arith.addf %9, %10 : vector<4xf32>
- vector.transfer_write %11, %8[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- } {mapping = [#gpu.thread<x>]}
- return
- }
- }
-}
}
// CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 * 4)>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir
index 3485646..af46bb3 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir
@@ -1,264 +1,214 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-gpu-distribute-shared-memory-copy, fold-memref-alias-ops, canonicalize, cse)))))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-codegen-gpu-distribute-shared-memory-copy, fold-memref-alias-ops, canonicalize, cse))' %s | FileCheck %s
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4)>
-// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>
-// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4 + 32)>
-// CHECK-DAG: #[[$MAP3:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 128)>
-// CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 128 + 128)>
-// CHECK-DAG: #[[$MAP5:.*]] = affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 128 + s2 * 512)>
-
+#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 4, 1]>
#map0 = affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 128 + s2 * 512)>
+module {
+ memref.global "private" @__shared_memory___1 : memref<3x512xf32, 3>
+ memref.global "private" @__shared_memory___0 : memref<256x4xf32, 3>
+ memref.global "private" @__shared_memory__ : memref<64x16xf32, 3>
+ func.func @shared_mem_cpy(
+ %m0 : memref<64x16xf32>, %m1 : memref<256x4xf32>, %m2 : memref<3x512xf32>)
+ attributes {hal.executable.target = #executable_target, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @shared_mem_cpy {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @shared_mem_cpy layout(#pipeline_layout) attributes {
- workgroup_size = [32: index, 4: index, 1:index]
- } {
- ^bb0(%arg0: !hal.device, %arg1 : index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+ %0 = "affine.apply"(%c0) {map = affine_map<(d0) -> (d0)>} : (index) -> (index)
+ %sm0 = memref.get_global @__shared_memory__ : memref<64x16xf32, 3>
+ %sm1 = memref.get_global @__shared_memory___0 : memref<256x4xf32, 3>
+ %sm2 = memref.get_global @__shared_memory___1 : memref<3x512xf32, 3>
+ gpu.barrier
+
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%m0 : memref<64x16xf32>)
+ outs(%sm0 : memref<64x16xf32, 3>)
+ attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%arg4: f32, %s: f32): // no predecessors
+ linalg.yield %arg4 : f32
}
- builtin.module {
- memref.global "private" @__shared_memory___1 : memref<3x512xf32, 3>
- memref.global "private" @__shared_memory___0 : memref<256x4xf32, 3>
- memref.global "private" @__shared_memory__ : memref<64x16xf32, 3>
- // CHECK-LABEL: @shared_mem_cpy(
- func.func @shared_mem_cpy(
- %m0 : memref<64x16xf32>, %m1 : memref<256x4xf32>, %m2 : memref<3x512xf32>) {
- %c0 = arith.constant 0 : index
- %0 = "affine.apply"(%c0) {map = affine_map<(d0) -> (d0)>} : (index) -> (index)
- %sm0 = memref.get_global @__shared_memory__ : memref<64x16xf32, 3>
- %sm1 = memref.get_global @__shared_memory___0 : memref<256x4xf32, 3>
- %sm2 = memref.get_global @__shared_memory___1 : memref<3x512xf32, 3>
- gpu.barrier
- // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
- // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
- // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
- // CHECK-DAG: %[[TX:.*]] = gpu.thread_id x
- // CHECK-DAG: %[[TY:.*]] = gpu.thread_id y
- // CHECK-DAG: %[[TZ:.*]] = gpu.thread_id z
-
- // CHECK-DAG: %[[Y0:.*]] = affine.apply #[[$MAP0]]()[%[[TX]], %[[TY]], %[[TZ]]]
- // CHECK-DAG: %[[X0:.*]] = affine.apply #[[$MAP1]]()[%[[TX]]]
- // CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[Y0]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[Y0]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3>
- // CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP2]]()[%[[TX]], %[[TY]], %[[TZ]]]
- // CHECK: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R1]], %{{.*}}[%[[Y1]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3>
-
- linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%m0 : memref<64x16xf32>)
- outs(%sm0 : memref<64x16xf32, 3>)
- attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
- ^bb0(%arg4: f32, %s: f32): // no predecessors
- linalg.yield %arg4 : f32
- }
-
- // CHECK: %[[Y1:.*]] = affine.apply #[[$MAP3]]()[%[[TX]], %[[TY]], %[[TZ]]]
- // CHECK: %[[R2:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R2]], %{{.*}}[%[[Y1]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3>
- // CHECK: %[[Y2:.*]] = affine.apply #[[$MAP4]]()[%[[TX]], %[[TY]], %[[TZ]]]
- // CHECK: %[[R3:.*]] = vector.transfer_read %{{.*}}[%[[Y2]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R3]], %{{.*}}[%[[Y2]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3>
-
- linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%m1 : memref<256x4xf32>)
- outs(%sm1 : memref<256x4xf32, 3>)
- attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
- ^bb0(%arg4: f32, %s: f32): // no predecessors
- linalg.yield %arg4 : f32
- }
-
- // CHECK: %[[X1:.*]] = affine.apply #[[$MAP5]]()[%[[TX]], %[[TY]], %[[TZ]]]
- // CHECK: %[[R4:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R4]], %{{.*}}[%[[C0]], %[[X1]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
- // CHECK: %[[R5:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R5]], %{{.*}}[%[[C1]], %[[X1]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
- // CHECK: %[[R6:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R6]], %{{.*}}[%[[C2]], %[[X1]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
-
- linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%m2 : memref<3x512xf32>)
- outs(%sm2 : memref<3x512xf32, 3>)
- attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
- ^bb0(%arg4: f32, %s: f32): // no predecessors
- linalg.yield %arg4 : f32
- }
- gpu.barrier
- return
- }
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%m1 : memref<256x4xf32>)
+ outs(%sm1 : memref<256x4xf32, 3>)
+ attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%arg4: f32, %s: f32): // no predecessors
+ linalg.yield %arg4 : f32
}
+
+ linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%m2 : memref<3x512xf32>)
+ outs(%sm2 : memref<3x512xf32, 3>)
+ attrs= {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%arg4: f32, %s: f32): // no predecessors
+ linalg.yield %arg4 : f32
+ }
+ gpu.barrier
+ return
}
}
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4 + 32)>
+// CHECK-DAG: #[[$MAP3:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 128)>
+// CHECK-DAG: #[[$MAP4:.*]] = affine_map<()[s0, s1, s2] -> (s0 + s1 * 32 + s2 * 128 + 128)>
+// CHECK-DAG: #[[$MAP5:.*]] = affine_map<()[s0, s1, s2] -> (s0 * 4 + s1 * 128 + s2 * 512)>
+// CHECK-LABEL: @shared_mem_cpy(
+
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[TX:.*]] = gpu.thread_id x
+// CHECK-DAG: %[[TY:.*]] = gpu.thread_id y
+// CHECK-DAG: %[[TZ:.*]] = gpu.thread_id z
+
+// CHECK-DAG: %[[Y0:.*]] = affine.apply #[[$MAP0]]()[%[[TX]], %[[TY]], %[[TZ]]]
+// CHECK-DAG: %[[X0:.*]] = affine.apply #[[$MAP1]]()[%[[TX]]]
+// CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[Y0]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[Y0]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3>
+// CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP2]]()[%[[TX]], %[[TY]], %[[TZ]]]
+// CHECK: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R1]], %{{.*}}[%[[Y1]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, 3>
+
+// CHECK: %[[Y1:.*]] = affine.apply #[[$MAP3]]()[%[[TX]], %[[TY]], %[[TZ]]]
+// CHECK: %[[R2:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R2]], %{{.*}}[%[[Y1]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3>
+// CHECK: %[[Y2:.*]] = affine.apply #[[$MAP4]]()[%[[TX]], %[[TY]], %[[TZ]]]
+// CHECK: %[[R3:.*]] = vector.transfer_read %{{.*}}[%[[Y2]], %[[C0]]], %{{.*}} {in_bounds = [true, true]} : memref<256x4xf32>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R3]], %{{.*}}[%[[Y2]], %[[C0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<256x4xf32, 3>
+
+// CHECK: %[[X1:.*]] = affine.apply #[[$MAP5]]()[%[[TX]], %[[TY]], %[[TZ]]]
+// CHECK: %[[R4:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R4]], %{{.*}}[%[[C0]], %[[X1]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
+// CHECK: %[[R5:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R5]], %{{.*}}[%[[C1]], %[[X1]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
+// CHECK: %[[R6:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[X1]]], %{{.*}} {in_bounds = [true, true]} : memref<3x512xf32>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R6]], %{{.*}}[%[[C2]], %[[X1]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<3x512xf32, 3>
+
// -----
-// CHECK-DAG: #[[$OFFSET_MAP:.+]] = affine_map<()[s0] -> (s0 * 4)>
+#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 8, 1]>
+module {
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>
-]>
-
-hal.executable private @unaligned_shared_memory_copy {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @unaligned_shared_memory_copy layout(#pipeline_layout) attributes {
- workgroup_size = [32: index, 8: index, 1:index]
- } {
- ^bb0(%arg0: !hal.device, %arg1 : index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+ func.func @unaligned_shared_memory_copy(
+ %global : memref<56x32xf32, strided<[128, 1], offset: ?>>, %shared : memref<56x32xf32, 3>)
+ attributes {hal.executable.target = #executable_target, translation_info = #translation_info} {
+ linalg.generic {
+ indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+ iterator_types = ["parallel", "parallel"]
}
- builtin.module {
-
- // CHECK-LABEL: func.func @unaligned_shared_memory_copy
- // CHECK-SAME: (%[[GLOBAL_MEM:.+]]: memref<56x32xf32, {{.+}}>, %[[SHARED_MEM:.+]]: memref<56x32xf32, 3>)
- func.func @unaligned_shared_memory_copy(
- %global : memref<56x32xf32, strided<[128, 1], offset: ?>>, %shared : memref<56x32xf32, 3>) {
-
- // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
- // CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
- // CHECK-DAG: %[[C56:.+]] = arith.constant 56 : index
- // CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
-
- // CHECK-DAG: %[[TID_X:.+]] = gpu.thread_id x
- // CHECK-DAG: %[[TID_Y:.+]] = gpu.thread_id y
-
- // CHECK: scf.for %[[IV_Y:.+]] = %[[TID_Y]] to %[[C56]] step %[[C8]] {
- // CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[TID_X]]]
- // CHECK: scf.for %[[IV_X:.+]] = %[[OFFSET_X]] to %[[C32]] step %[[C128]] {
- // CHECK: %[[GLOBAL_SUBVIEW:.+]] = memref.subview %[[GLOBAL_MEM]][%[[IV_Y]], %[[IV_X]]] [1, 4] [1, 1]
- // CHECK-SAME: : memref<56x32xf32, {{.+}}> to memref<1x4xf32, {{.+}}>
- // CHECK: %[[SHARED_SUBVIEW:.+]] = memref.subview %[[SHARED_MEM]][%[[IV_Y]], %[[IV_X]]] [1, 4] [1, 1]
- // CHECK-SAME: : memref<56x32xf32, 3> to memref<1x4xf32, strided<[32, 1], offset: ?>, 3>
- // CHECK: linalg.generic
- // CHECK-SAME: ins(%[[GLOBAL_SUBVIEW]]
- // CHECK-SAME: outs(%[[SHARED_SUBVIEW]]
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- }
- ins(%global : memref<56x32xf32, strided<[128, 1], offset: ?>>)
- outs(%shared : memref<56x32xf32, 3>)
- attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
- ^bb0(%arg0: f32, %arg1: f32):
- linalg.yield %arg0 : f32
- }
- return
- }
+ ins(%global : memref<56x32xf32, strided<[128, 1], offset: ?>>)
+ outs(%shared : memref<56x32xf32, 3>)
+ attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%arg0: f32, %arg1: f32):
+ linalg.yield %arg0 : f32
}
+ return
}
}
+// CHECK-DAG: #[[$OFFSET_MAP:.+]] = affine_map<()[s0] -> (s0 * 4)>
+// CHECK-LABEL: func.func @unaligned_shared_memory_copy
+// CHECK-SAME: (%[[GLOBAL_MEM:.+]]: memref<56x32xf32, {{.+}}>, %[[SHARED_MEM:.+]]: memref<56x32xf32, 3>)
+
+// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
+// CHECK-DAG: %[[C56:.+]] = arith.constant 56 : index
+// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
+
+// CHECK-DAG: %[[TID_X:.+]] = gpu.thread_id x
+// CHECK-DAG: %[[TID_Y:.+]] = gpu.thread_id y
+
+// CHECK: scf.for %[[IV_Y:.+]] = %[[TID_Y]] to %[[C56]] step %[[C8]] {
+// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[TID_X]]]
+// CHECK: scf.for %[[IV_X:.+]] = %[[OFFSET_X]] to %[[C32]] step %[[C128]] {
+// CHECK: %[[GLOBAL_SUBVIEW:.+]] = memref.subview %[[GLOBAL_MEM]][%[[IV_Y]], %[[IV_X]]] [1, 4] [1, 1]
+// CHECK-SAME: : memref<56x32xf32, {{.+}}> to memref<1x4xf32, {{.+}}>
+// CHECK: %[[SHARED_SUBVIEW:.+]] = memref.subview %[[SHARED_MEM]][%[[IV_Y]], %[[IV_X]]] [1, 4] [1, 1]
+// CHECK-SAME: : memref<56x32xf32, 3> to memref<1x4xf32, strided<[32, 1], offset: ?>, 3>
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[GLOBAL_SUBVIEW]]
+// CHECK-SAME: outs(%[[SHARED_SUBVIEW]]
+
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>
-]>
-
-hal.executable private @zero_dim_shared_memory_copy {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @zero_dim_shared_memory_copy layout(#pipeline_layout) attributes {
- workgroup_size = [32: index, 8: index, 1:index]
- } {
- ^bb0(%arg0: !hal.device, %arg1 : index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 8, 1]>
+module {
+ func.func @zero_dim_shared_memory_copy(%global : memref<f32>, %shared : memref<f32>)
+ attributes {hal.executable.target = #executable_target, translation_info = #translation_info} {
+ linalg.generic {
+ indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>],
+ iterator_types = []
}
- builtin.module {
- // CHECK-LABEL: func.func @zero_dim_shared_memory_copy
- // CHECK-SAME: (%[[GLOBAL_MEM:.+]]: memref<f32>, %[[SHARED_MEM:.+]]: memref<f32>)
- func.func @zero_dim_shared_memory_copy(%global : memref<f32>, %shared : memref<f32>) {
- // CHECK: linalg.generic
- // CHECK-SAME: ins(%[[GLOBAL_MEM]]
- // CHECK-SAME: outs(%[[SHARED_MEM]]
- linalg.generic {
- indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>],
- iterator_types = []
- }
- ins(%global : memref<f32>)
- outs(%shared : memref<f32>)
- attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
- ^bb0(%in: f32, %out: f32):
- linalg.yield %in : f32
- }
- return
- }
+ ins(%global : memref<f32>)
+ outs(%shared : memref<f32>)
+ attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
}
+ return
}
}
+// CHECK-LABEL: func.func @zero_dim_shared_memory_copy
+// CHECK-SAME: (%[[GLOBAL_MEM:.+]]: memref<f32>, %[[SHARED_MEM:.+]]: memref<f32>)
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[GLOBAL_MEM]]
+// CHECK-SAME: outs(%[[SHARED_MEM]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>
-]>
-
-hal.executable private @zero_dim_shared_memory_copy {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @zero_dim_shared_memory_copy layout(#pipeline_layout) attributes {
- workgroup_size = [32: index, 8: index, 1:index]
- } {
- ^bb0(%arg0: !hal.device, %arg1 : index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 8, 1]>
+module {
+ func.func @zero_dim_shared_memory_copy(%A: memref<1x32x128xi4>, %B: memref<1x128xf32>, %C: memref<1x128xi4>,
+ %SM: memref<1x32x128xf32, #gpu.address_space<workgroup>>)
+ attributes {hal.executable.target = #executable_target, translation_info = #translation_info} {
+ linalg.generic {
+ indexing_maps = [
+ affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+ affine_map<(d0, d1, d2) -> (d0, d2)>,
+ affine_map<(d0, d1, d2) -> (d0, d2)>,
+ affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+ ],
+ iterator_types = ["parallel", "parallel", "parallel"]
}
- builtin.module {
- func.func @zero_dim_shared_memory_copy(%A: memref<1x32x128xi4>, %B: memref<1x128xf32>, %C: memref<1x128xi4>,
- %SM: memref<1x32x128xf32, #gpu.address_space<workgroup>>) {
- linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>
- ],
- iterator_types = ["parallel", "parallel", "parallel"]
- }
- ins(%A, %B, %C : memref<1x32x128xi4>, memref<1x128xf32>, memref<1x128xi4>)
- outs(%SM : memref<1x32x128xf32, #gpu.address_space<workgroup>>)
- attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
- ^bb0(%in: i4, %in_14: f32, %in_15: i4, %out: f32):
- %19 = arith.extui %in : i4 to i32
- %20 = arith.extui %in_15 : i4 to i32
- %21 = arith.subi %19, %20 : i32
- %22 = arith.sitofp %21 : i32 to f32
- %23 = arith.mulf %22, %in_14 : f32
- linalg.yield %23 : f32
- }
- return
- }
+ ins(%A, %B, %C : memref<1x32x128xi4>, memref<1x128xf32>, memref<1x128xi4>)
+ outs(%SM : memref<1x32x128xf32, #gpu.address_space<workgroup>>)
+ attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
+ ^bb0(%in: i4, %in_14: f32, %in_15: i4, %out: f32):
+ %19 = arith.extui %in : i4 to i32
+ %20 = arith.extui %in_15 : i4 to i32
+ %21 = arith.subi %19, %20 : i32
+ %22 = arith.sitofp %21 : i32 to f32
+ %23 = arith.mulf %22, %in_14 : f32
+ linalg.yield %23 : f32
}
+ return
}
}
// CHECK-LABEL: func.func @zero_dim_shared_memory_copy
// CHECK-SAME: (%[[A:.+]]: memref<1x32x128xi4>, %{{.+}}: memref<1x128xf32>, %[[C:.+]]: memref<1x128xi4>, %[[SM:.+]]: memref<1x32x128xf32, {{.*}}>)
-// CHECK: %[[A0:.+]] = vector.transfer_read %[[A]]
-// CHECK: %[[C0:.+]] = vector.transfer_read %[[C]]
-// CHECK: %[[A0E:.+]] = arith.extui %[[A0]] : vector<1x1x8xi4> to vector<1x1x8xi32>
-// CHECK: %[[C0E:.+]] = arith.extui %[[C0]] : vector<1x1x8xi4> to vector<1x1x8xi32>
-// CHECK: %[[SUB0:.+]] = arith.subi %[[A0E]], %[[C0E]] : vector<1x1x8xi32>
-// CHECK: %[[EXT0:.+]] = arith.sitofp %[[SUB0]] : vector<1x1x8xi32> to vector<1x1x8xf32>
-// CHECK: %[[MUL0:.+]] = arith.mulf %[[EXT0]], %{{.+}} : vector<1x1x8xf32>
-// CHECK: vector.transfer_write %[[MUL0]], %[[SM]]
+// CHECK: %[[A0:.+]] = vector.transfer_read %[[A]]
+// CHECK: %[[C0:.+]] = vector.transfer_read %[[C]]
+// CHECK: %[[A0E:.+]] = arith.extui %[[A0]] : vector<1x1x8xi4> to vector<1x1x8xi32>
+// CHECK: %[[C0E:.+]] = arith.extui %[[C0]] : vector<1x1x8xi4> to vector<1x1x8xi32>
+// CHECK: %[[SUB0:.+]] = arith.subi %[[A0E]], %[[C0E]] : vector<1x1x8xi32>
+// CHECK: %[[EXT0:.+]] = arith.sitofp %[[SUB0]] : vector<1x1x8xi32> to vector<1x1x8xf32>
+// CHECK: %[[MUL0:.+]] = arith.mulf %[[EXT0]], %{{.+}} : vector<1x1x8xf32>
+// CHECK: vector.transfer_write %[[MUL0]], %[[SM]]
-// CHECK: %[[A1:.+]] = vector.transfer_read %[[A]]
-// CHECK: %[[C1:.+]] = vector.transfer_read %[[C]]
-// CHECK: %[[A1E:.+]] = arith.extui %[[A1]] : vector<1x1x8xi4> to vector<1x1x8xi32>
-// CHECK: %[[C1E:.+]] = arith.extui %[[C1]] : vector<1x1x8xi4> to vector<1x1x8xi32>
-// CHECK: %[[SUB1:.+]] = arith.subi %[[A1E]], %[[C1E]] : vector<1x1x8xi32>
-// CHECK: %[[EXT1:.+]] = arith.sitofp %[[SUB1]] : vector<1x1x8xi32> to vector<1x1x8xf32>
-// CHECK: %[[MUL1:.+]] = arith.mulf %[[EXT1]], %{{.+}} : vector<1x1x8xf32>
-// CHECK: vector.transfer_write %[[MUL1]], %[[SM]]
+// CHECK: %[[A1:.+]] = vector.transfer_read %[[A]]
+// CHECK: %[[C1:.+]] = vector.transfer_read %[[C]]
+// CHECK: %[[A1E:.+]] = arith.extui %[[A1]] : vector<1x1x8xi4> to vector<1x1x8xi32>
+// CHECK: %[[C1E:.+]] = arith.extui %[[C1]] : vector<1x1x8xi4> to vector<1x1x8xi32>
+// CHECK: %[[SUB1:.+]] = arith.subi %[[A1E]], %[[C1E]] : vector<1x1x8xi32>
+// CHECK: %[[EXT1:.+]] = arith.sitofp %[[SUB1]] : vector<1x1x8xi32> to vector<1x1x8xf32>
+// CHECK: %[[MUL1:.+]] = arith.mulf %[[EXT1]], %{{.+}} : vector<1x1x8xf32>
+// CHECK: vector.transfer_write %[[MUL1]], %[[SM]]
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir
index ae8a862..b0b176e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir
@@ -1,38 +1,30 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-gpu-tensor-tile, cse)))))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-tensor-tile, cse))" %s | FileCheck %s
-hal.executable private @add_tensor {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @add_tensor ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>,
- workgroup_size = [64 : index, 1 : index, 1 : index]} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+#config = #iree_codegen.lowering_config<tile_sizes = [[1, 256]]>
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#map = affine_map<()[s0] -> (s0 * 256)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+module {
+ func.func @add_tensor() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<233x1024xf32>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %3 = affine.apply #map()[%workgroup_id_x]
+ %4 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<233x1024xf32>> -> tensor<1x256xf32>
+ %5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>> -> tensor<1x256xf32>
+ %6 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>> -> tensor<1x256xf32>
+ %7 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%5, %6 : tensor<1x256xf32>, tensor<1x256xf32>) outs(%4 : tensor<1x256xf32>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.addf %in, %in_0 : f32
+ linalg.yield %8 : f32
+ } -> tensor<1x256xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : tensor<1x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<233x1024xf32>>
+ return
}
- builtin.module {
- func.func @add_tensor() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<233x1024xf32>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x]
- %4 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<233x1024xf32>> -> tensor<1x256xf32>
- %5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>> -> tensor<1x256xf32>
- %6 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<233x1024xf32>> -> tensor<1x256xf32>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5, %6 : tensor<1x256xf32>, tensor<1x256xf32>) outs(%4 : tensor<1x256xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 256]]>} {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %8 = arith.addf %arg0, %arg1 : f32
- linalg.yield %8 : f32
- } -> tensor<1x256xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [%workgroup_id_y, %3], sizes = [1, 256], strides = [1, 1] : tensor<1x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<233x1024xf32>>
- return
- }
- }
-}
}
// CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 * 4)>
@@ -60,38 +52,31 @@
// -----
-hal.executable private @reduction {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @reduction ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>,
- workgroup_size = [64 : index, 1 : index, 1 : index]} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 4]]>
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#map = affine_map<()[s0] -> (s0 * 64)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+#map2 = affine_map<(d0, d1) -> (d0)>
+#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+module {
+ func.func @reduction() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %2 = affine.apply #map()[%workgroup_id_x]
+ %3 = flow.dispatch.tensor.load %1, offsets = [%2], sizes = [64], strides = [1] : !flow.dispatch.tensor<writeonly:tensor<128xf32>> -> tensor<64xf32>
+ %4 = flow.dispatch.tensor.load %0, offsets = [%2, 0], sizes = [64, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<64x384xf32>
+ %5 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%3 : tensor<64xf32>) -> tensor<64xf32>
+ %6 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "reduction"]} ins(%4 : tensor<64x384xf32>) outs(%5 : tensor<64xf32>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f32, %out: f32):
+ %7 = arith.addf %in, %out : f32
+ linalg.yield %7 : f32
+ } -> tensor<64xf32>
+ flow.dispatch.tensor.store %6, %1, offsets = [%2], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+ return
}
- builtin.module {
- func.func @reduction() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
- %3 = flow.dispatch.tensor.load %1, offsets = [%2], sizes = [64], strides = [1] : !flow.dispatch.tensor<writeonly:tensor<128xf32>> -> tensor<64xf32>
- %4 = flow.dispatch.tensor.load %0, offsets = [%2, 0], sizes = [64, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<64x384xf32>
- %5 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4]]>} ins(%cst : f32) outs(%3 : tensor<64xf32>) -> tensor<64xf32>
- %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%4 : tensor<64x384xf32>) outs(%5 : tensor<64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 4]]>} {
- ^bb0(%arg0: f32, %arg1: f32):
- %7 = arith.addf %arg0, %arg1 : f32
- linalg.yield %7 : f32
- } -> tensor<64xf32>
- flow.dispatch.tensor.store %6, %1, offsets = [%2], sizes = [64], strides = [1] : tensor<64xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func.func @reduction
@@ -120,54 +105,40 @@
// -----
-hal.executable private @reduction_broadcast {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @reduction_broadcast ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>, <2, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>,
- workgroup_size = [64 : index, 1 : index, 1 : index]} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+#config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 4, 4]]>
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#map = affine_map<()[s0] -> (s0 * 64)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+module {
+ func.func @reduction_broadcast() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x32x10x4096xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x4096xf32>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %2 = affine.apply #map()[%workgroup_id_x]
+ %3 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_y, %2, 0, 0], sizes = [1, 32, 10, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x10x4096xf32>> -> tensor<1x32x10x4096xf32>
+ %4 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %2, 0, 0], sizes = [1, 32, 10, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x4096xf32>> -> tensor<1x32x10x4096xf32>
+ %5 = tensor.empty() : tensor<1x32xf32>
+ %6 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%5 : tensor<1x32xf32>) -> tensor<1x32xf32>
+ %7 = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%4 : tensor<1x32x10x4096xf32>) outs(%6 : tensor<1x32xf32>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f32, %out: f32):
+ %9 = arith.addf %in, %out : f32
+ linalg.yield %9 : f32
+ } -> tensor<1x32xf32>
+ %8 = linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %7 : tensor<1x32x10x4096xf32>, tensor<1x32xf32>) outs(%3 : tensor<1x32x10x4096xf32>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %9 = arith.addf %in, %in_0 : f32
+ linalg.yield %9 : f32
+ } -> tensor<1x32x10x4096xf32>
+ flow.dispatch.tensor.store %8, %1, offsets = [%workgroup_id_y, %2, 0, 0], sizes = [1, 32, 10, 4096], strides = [1, 1, 1, 1] : tensor<1x32x10x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x4096xf32>>
+ return
}
- builtin.module {
- func.func @reduction_broadcast() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x32x10x4096xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x32x10x4096xf32>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
- %3 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_y, %2, 0, 0], sizes = [1, 32, 10, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x10x4096xf32>> -> tensor<1x32x10x4096xf32>
- %4 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %2, 0, 0], sizes = [1, 32, 10, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x10x4096xf32>> -> tensor<1x32x10x4096xf32>
- %5 = tensor.empty() : tensor<1x32xf32>
- %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 4, 4]]>} ins(%cst : f32) outs(%5 : tensor<1x32xf32>) -> tensor<1x32xf32>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
- ins(%4 : tensor<1x32x10x4096xf32>) outs(%6 : tensor<1x32xf32>)
- attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 4, 4]]>} {
- ^bb0(%arg0: f32, %arg1: f32):
- %9 = arith.addf %arg0, %arg1 : f32
- linalg.yield %9 : f32
- } -> tensor<1x32xf32>
- %8 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
- ins(%4, %7 : tensor<1x32x10x4096xf32>, tensor<1x32xf32>) outs(%3 : tensor<1x32x10x4096xf32>)
- attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 4, 4]]>} {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %9 = arith.addf %arg0, %arg1 : f32
- linalg.yield %9 : f32
- } -> tensor<1x32x10x4096xf32>
- flow.dispatch.tensor.store %8, %1, offsets = [%workgroup_id_y, %2, 0, 0], sizes = [1, 32, 10, 4096], strides = [1, 1, 1, 1] : tensor<1x32x10x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x10x4096xf32>>
- return
- }
}
-}
-}
+
// Check that the parallel dimensions that didn't get distributed are being
// tiled with a serial loop. This happens because the broadcast has an extra
// parallel dimension that won't get distributed by tile and distribute to
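For orientation: the tests rewritten in this patch all follow the same pared-down scaffolding seen above, with the target and translation info (including the workgroup size) attached directly to the function. A minimal sketch, with the dispatch body elided and the names purely illustrative:

```
#target = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
module {
  // The function carries everything the lowering needs; no hal.executable,
  // hal.executable.variant, or hal.executable.export wrapper is required.
  func.func @example_dispatch() attributes {hal.executable.target = #target,
                                            translation_info = #translation} {
    // ... dispatch body under test ...
    return
  }
}
```

The RUN lines change accordingly: the pass pipeline anchors directly on `func.func` (`builtin.module(func.func(...))`) instead of nesting through the executable and variant ops.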
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
index 5f534a7..0a03966 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
@@ -305,131 +305,105 @@
#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]>
#layout2d = #iree_vector_ext.layout<#row_layout, #col_layout>
#layout1d = #iree_vector_ext.layout<#col_layout>
-// Dummy hal executable ops to set the subgroup size which is required for distributing reductions.
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>]>]>
-hal.executable private @reduction_dispatch {
- hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
- hal.executable.export public @distribute_reduction_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index} {
- ^bb0(%arg0: !hal.device):
- %c2 = arith.constant 2 : index
- %c32 = arith.constant 32 : index
- %c1 = arith.constant 1 : index
- hal.return %c2, %c32, %c1 : index, index, index
- }
- builtin.module {
- func.func @distribute_reduction_f16(%source: vector<16x16xf16>, %init: vector<16xf16>) -> vector<16xf16> {
- // CHECK: func.func @distribute_reduction_f16(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf16>, %[[ARG1:[a-zA-Z0-9_]+]]:
- // CHECK-SAME: vector<16xf16>) -> vector<16xf16> {
- // CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32
- // CHECK-DAG: %[[C64_I32:.+]] = arith.constant 64 : i32
- // CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32
- // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2xf16>
- // CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<0.000000e+00> : vector<1xf16>
- // CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf16> -> vector<1xf16>
- // CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f16 from vector<1xf16>
- // CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x4xf16>
- // CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f16 from vector<1x1x4xf16>
- // CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f16 into vector<2xf16>
- // CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f16 from vector<1x1x4xf16>
- // CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [1] : f16 into vector<2xf16>
- // CHECK: %[[D7:.+]] = vector.extract %[[D2]][0, 0, 2] : f16 from vector<1x1x4xf16>
- // CHECK: %[[D8:.+]] = vector.insert %[[D7]], %[[D6]] [0] : f16 into vector<2xf16>
- // CHECK: %[[D9:.+]] = vector.extract %[[D2]][0, 0, 3] : f16 from vector<1x1x4xf16>
- // CHECK: %[[D10:.+]] = vector.insert %[[D9]], %[[D8]] [1] : f16 into vector<2xf16>
- // CHECK: %[[D11:.+]] = arith.maximumf %[[D6]], %[[D10]] : vector<2xf16>
- // CHECK: %[[D12:.+]] = vector.bitcast %[[D11]] : vector<2xf16> to vector<1xi32>
- // CHECK: %[[D13:.+]] = vector.extract %[[D12]][0] : i32 from vector<1xi32>
- // CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D13]], %[[C16_I32]], %[[C64_I32]] : i32
- // CHECK: %[[D14:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32>
- // CHECK: %[[D15:.+]] = vector.bitcast %[[D14]] : vector<1xi32> to vector<2xf16>
- // CHECK: %[[D16:.+]] = arith.maximumf %[[D15]], %[[D11]] : vector<2xf16>
- // CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<2xf16> to vector<1xi32>
- // CHECK: %[[D18:.+]] = vector.extract %[[D17]][0] : i32 from vector<1xi32>
- // CHECK: %[[SHUFFLERESULT_1:.+]], %[[VALID_2:.+]] = gpu.shuffle xor %[[D18]], %[[C32_I32]], %[[C64_I32]] : i32
- // CHECK: %[[D19:.+]] = vector.broadcast %[[SHUFFLERESULT_1]] : i32 to vector<1xi32>
- // CHECK: %[[D20:.+]] = vector.bitcast %[[D19]] : vector<1xi32> to vector<2xf16>
- // CHECK: %[[D21:.+]] = arith.maximumf %[[D20]], %[[D16]] : vector<2xf16>
- // CHECK: %[[D22:.+]] = vector.extract %[[D21]][0] : f16 from vector<2xf16>
- // CHECK: %[[D23:.+]] = vector.extract %[[D21]][1] : f16 from vector<2xf16>
- // CHECK: %[[D24:.+]] = arith.maximumf %[[D22]], %[[D23]] : f16
- // CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f16
- // CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST_0]] [0] : f16 into vector<1xf16>
- // CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf16> -> vector<16xf16>
- %result = vector.multi_reduction <maximumf>, %source, %init {
- "__vector_layout_test_anchor_operand_0" = #layout2d,
- "__vector_layout_test_anchor_operand_1" = #layout1d,
- "__vector_layout_test_anchor_result_0" = #layout1d
- } [0] : vector<16x16xf16> to vector<16xf16>
- func.return %result : vector<16xf16>
- }
- }
+#translation_info = #iree_codegen.translation_info<None subgroup_size = 64>
+module {
+ func.func @distribute_reduction_f16(%source: vector<16x16xf16>, %init: vector<16xf16>) -> vector<16xf16>
+ attributes {hal.executable.target = #executable_target_rocm_hsaco_fb, translation_info = #translation_info} {
+ %result = vector.multi_reduction <maximumf>, %source, %init {
+ "__vector_layout_test_anchor_operand_0" = #layout2d,
+ "__vector_layout_test_anchor_operand_1" = #layout1d,
+ "__vector_layout_test_anchor_result_0" = #layout1d
+ } [0] : vector<16x16xf16> to vector<16xf16>
+ func.return %result : vector<16xf16>
}
}
+// CHECK: func.func @distribute_reduction_f16(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf16>, %[[ARG1:[a-zA-Z0-9_]+]]: vector<16xf16>) -> vector<16xf16>
+// CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32
+// CHECK-DAG: %[[C64_I32:.+]] = arith.constant 64 : i32
+// CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32
+// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2xf16>
+// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<0.000000e+00> : vector<1xf16>
+// CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf16> -> vector<1xf16>
+// CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f16 from vector<1xf16>
+// CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x4xf16>
+// CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f16 from vector<1x1x4xf16>
+// CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f16 into vector<2xf16>
+// CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f16 from vector<1x1x4xf16>
+// CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [1] : f16 into vector<2xf16>
+// CHECK: %[[D7:.+]] = vector.extract %[[D2]][0, 0, 2] : f16 from vector<1x1x4xf16>
+// CHECK: %[[D8:.+]] = vector.insert %[[D7]], %[[D6]] [0] : f16 into vector<2xf16>
+// CHECK: %[[D9:.+]] = vector.extract %[[D2]][0, 0, 3] : f16 from vector<1x1x4xf16>
+// CHECK: %[[D10:.+]] = vector.insert %[[D9]], %[[D8]] [1] : f16 into vector<2xf16>
+// CHECK: %[[D11:.+]] = arith.maximumf %[[D6]], %[[D10]] : vector<2xf16>
+// CHECK: %[[D12:.+]] = vector.bitcast %[[D11]] : vector<2xf16> to vector<1xi32>
+// CHECK: %[[D13:.+]] = vector.extract %[[D12]][0] : i32 from vector<1xi32>
+// CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D13]], %[[C16_I32]], %[[C64_I32]] : i32
+// CHECK: %[[D14:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32>
+// CHECK: %[[D15:.+]] = vector.bitcast %[[D14]] : vector<1xi32> to vector<2xf16>
+// CHECK: %[[D16:.+]] = arith.maximumf %[[D15]], %[[D11]] : vector<2xf16>
+// CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<2xf16> to vector<1xi32>
+// CHECK: %[[D18:.+]] = vector.extract %[[D17]][0] : i32 from vector<1xi32>
+// CHECK: %[[SHUFFLERESULT_1:.+]], %[[VALID_2:.+]] = gpu.shuffle xor %[[D18]], %[[C32_I32]], %[[C64_I32]] : i32
+// CHECK: %[[D19:.+]] = vector.broadcast %[[SHUFFLERESULT_1]] : i32 to vector<1xi32>
+// CHECK: %[[D20:.+]] = vector.bitcast %[[D19]] : vector<1xi32> to vector<2xf16>
+// CHECK: %[[D21:.+]] = arith.maximumf %[[D20]], %[[D16]] : vector<2xf16>
+// CHECK: %[[D22:.+]] = vector.extract %[[D21]][0] : f16 from vector<2xf16>
+// CHECK: %[[D23:.+]] = vector.extract %[[D21]][1] : f16 from vector<2xf16>
+// CHECK: %[[D24:.+]] = arith.maximumf %[[D22]], %[[D23]] : f16
+// CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f16
+// CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST_0]] [0] : f16 into vector<1xf16>
+// CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf16> -> vector<16xf16>
-// Dummy hal executable ops to set the subgroup size which is required for distributing reductions.
#executable_target_rocm_hsaco_fb2 = #hal.executable.target<"rocm", "rocm-hsaco-fb", {}>
-#pipeline_layout2 = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>]>]>
-hal.executable private @reduction_dispatch2 {
- hal.executable.variant public @rocm_hsaco_fb2 target(#executable_target_rocm_hsaco_fb2) {
- hal.executable.export public @distribute_reduction_f32 ordinal(0) layout(#pipeline_layout2) attributes {subgroup_size = 64 : index} {
- ^bb0(%arg0: !hal.device):
- %c2 = arith.constant 2 : index
- %c32 = arith.constant 32 : index
- %c1 = arith.constant 1 : index
- hal.return %c2, %c32, %c1 : index, index, index
- }
- builtin.module {
- func.func @distribute_reduction_f32(%source: vector<16x16xf32>, %init: vector<16xf32>) -> vector<16xf32> {
- // CHECK: func.func @distribute_reduction_f32(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf32>, %[[ARG1:[a-zA-Z0-9_]+]]:
- // CHECK-SAME: vector<16xf32>) -> vector<16xf32> {
- // CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32
- // CHECK-DAG: %[[C64_I32:.+]] = arith.constant 64 : i32
- // CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32
- // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1xf32>
- // CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf32> -> vector<1xf32>
- // CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f32 from vector<1xf32>
- // CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf32> -> vector<1x1x4xf32>
- // CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f32 from vector<1x1x4xf32>
- // CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f32 into vector<1xf32>
- // CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f32 from vector<1x1x4xf32>
- // CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [0] : f32 into vector<1xf32>
- // CHECK: %[[D7:.+]] = arith.maximumf %[[D4]], %[[D6]] : vector<1xf32>
- // CHECK: %[[D8:.+]] = vector.extract %[[D2]][0, 0, 2] : f32 from vector<1x1x4xf32>
- // CHECK: %[[D9:.+]] = vector.insert %[[D8]], %[[D6]] [0] : f32 into vector<1xf32>
- // CHECK: %[[D10:.+]] = arith.maximumf %[[D7]], %[[D9]] : vector<1xf32>
- // CHECK: %[[D11:.+]] = vector.extract %[[D2]][0, 0, 3] : f32 from vector<1x1x4xf32>
- // CHECK: %[[D12:.+]] = vector.insert %[[D11]], %[[D9]] [0] : f32 into vector<1xf32>
- // CHECK: %[[D13:.+]] = arith.maximumf %[[D10]], %[[D12]] : vector<1xf32>
- // CHECK: %[[D14:.+]] = vector.bitcast %[[D13]] : vector<1xf32> to vector<1xi32>
- // CHECK: %[[D15:.+]] = vector.extract %[[D14]][0] : i32 from vector<1xi32>
- // CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D15]], %[[C16_I32]], %[[C64_I32]] : i32
- // CHECK: %[[D16:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32>
- // CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<1xi32> to vector<1xf32>
- // CHECK: %[[D18:.+]] = arith.maximumf %[[D17]], %[[D13]] : vector<1xf32>
- // CHECK: %[[D19:.+]] = vector.bitcast %[[D18]] : vector<1xf32> to vector<1xi32>
- // CHECK: %[[D20:.+]] = vector.extract %[[D19]][0] : i32 from vector<1xi32>
- // CHECK: %[[SHUFFLERESULT_0:.+]], %[[VALID_1:.+]] = gpu.shuffle xor %[[D20]], %[[C32_I32]], %[[C64_I32]] : i32
- // CHECK: %[[D21:.+]] = vector.broadcast %[[SHUFFLERESULT_0]] : i32 to vector<1xi32>
- // CHECK: %[[D22:.+]] = vector.bitcast %[[D21]] : vector<1xi32> to vector<1xf32>
- // CHECK: %[[D23:.+]] = arith.maximumf %[[D22]], %[[D18]] : vector<1xf32>
- // CHECK: %[[D24:.+]] = vector.extract %[[D23]][0] : f32 from vector<1xf32>
- // CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f32
- // CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST]] [0] : f32 into vector<1xf32>
- // CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf32> -> vector<16xf32>
- %result = vector.multi_reduction <maximumf>, %source, %init {
- "__vector_layout_test_anchor_operand_0" = #layout2d,
- "__vector_layout_test_anchor_operand_1" = #layout1d,
- "__vector_layout_test_anchor_result_0" = #layout1d
- } [0] : vector<16x16xf32> to vector<16xf32>
- func.return %result : vector<16xf32>
- }
- }
+module {
+ func.func @distribute_reduction_f32(%source: vector<16x16xf32>, %init: vector<16xf32>) -> vector<16xf32>
+ attributes {hal.executable.target = #executable_target_rocm_hsaco_fb, translation_info = #translation_info} {
+ %result = vector.multi_reduction <maximumf>, %source, %init {
+ "__vector_layout_test_anchor_operand_0" = #layout2d,
+ "__vector_layout_test_anchor_operand_1" = #layout1d,
+ "__vector_layout_test_anchor_result_0" = #layout1d
+ } [0] : vector<16x16xf32> to vector<16xf32>
+ func.return %result : vector<16xf32>
}
}
+// CHECK: func.func @distribute_reduction_f32(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf32>, %[[ARG1:[a-zA-Z0-9_]+]]: vector<16xf32>) -> vector<16xf32>
+// CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32
+// CHECK-DAG: %[[C64_I32:.+]] = arith.constant 64 : i32
+// CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32
+// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1xf32>
+// CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf32> -> vector<1xf32>
+// CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f32 from vector<1xf32>
+// CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf32> -> vector<1x1x4xf32>
+// CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f32 from vector<1x1x4xf32>
+// CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f32 into vector<1xf32>
+// CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f32 from vector<1x1x4xf32>
+// CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [0] : f32 into vector<1xf32>
+// CHECK: %[[D7:.+]] = arith.maximumf %[[D4]], %[[D6]] : vector<1xf32>
+// CHECK: %[[D8:.+]] = vector.extract %[[D2]][0, 0, 2] : f32 from vector<1x1x4xf32>
+// CHECK: %[[D9:.+]] = vector.insert %[[D8]], %[[D6]] [0] : f32 into vector<1xf32>
+// CHECK: %[[D10:.+]] = arith.maximumf %[[D7]], %[[D9]] : vector<1xf32>
+// CHECK: %[[D11:.+]] = vector.extract %[[D2]][0, 0, 3] : f32 from vector<1x1x4xf32>
+// CHECK: %[[D12:.+]] = vector.insert %[[D11]], %[[D9]] [0] : f32 into vector<1xf32>
+// CHECK: %[[D13:.+]] = arith.maximumf %[[D10]], %[[D12]] : vector<1xf32>
+// CHECK: %[[D14:.+]] = vector.bitcast %[[D13]] : vector<1xf32> to vector<1xi32>
+// CHECK: %[[D15:.+]] = vector.extract %[[D14]][0] : i32 from vector<1xi32>
+// CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D15]], %[[C16_I32]], %[[C64_I32]] : i32
+// CHECK: %[[D16:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32>
+// CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<1xi32> to vector<1xf32>
+// CHECK: %[[D18:.+]] = arith.maximumf %[[D17]], %[[D13]] : vector<1xf32>
+// CHECK: %[[D19:.+]] = vector.bitcast %[[D18]] : vector<1xf32> to vector<1xi32>
+// CHECK: %[[D20:.+]] = vector.extract %[[D19]][0] : i32 from vector<1xi32>
+// CHECK: %[[SHUFFLERESULT_0:.+]], %[[VALID_1:.+]] = gpu.shuffle xor %[[D20]], %[[C32_I32]], %[[C64_I32]] : i32
+// CHECK: %[[D21:.+]] = vector.broadcast %[[SHUFFLERESULT_0]] : i32 to vector<1xi32>
+// CHECK: %[[D22:.+]] = vector.bitcast %[[D21]] : vector<1xi32> to vector<1xf32>
+// CHECK: %[[D23:.+]] = arith.maximumf %[[D22]], %[[D18]] : vector<1xf32>
+// CHECK: %[[D24:.+]] = vector.extract %[[D23]][0] : f32 from vector<1xf32>
+// CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f32
+// CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST]] [0] : f32 into vector<1xf32>
+// CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf32> -> vector<16xf32>
#transpose_test_layout = #iree_vector_ext.layout<<[LANEY], [32]>, <[LANEX, VECTORX], [4, 4]>>
-
func.func @distribute_transpose(%mem: memref<32x32xf16>, %mem1: memref<32x32xf16>) -> vector<32x16xf16> {
// CHECK: func.func @distribute_transpose(%[[MEM:.*]]: memref<32x32xf16>, %[[MEM1:.*]]: memref<32x32xf16>
%c0 = arith.constant 0 : index
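Similarly, the dummy executable scaffolding that previously existed only to pin the subgroup size is dropped; the subgroup size now travels on the translation info of the function itself, with the `None` pipeline indicating that no further lowering strategy is attached. A minimal sketch (layout anchor attributes and CHECK lines elided; names illustrative):

```
#target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {}>
#translation = #iree_codegen.translation_info<None subgroup_size = 64>
// The subgroup size required for distributing the reduction comes from the
// translation_info attribute instead of a hal.executable.export op.
func.func @distribute_example(%source: vector<16x16xf16>, %init: vector<16xf16>) -> vector<16xf16>
    attributes {hal.executable.target = #target, translation_info = #translation} {
  %result = vector.multi_reduction <maximumf>, %source, %init [0]
      : vector<16x16xf16> to vector<16xf16>
  func.return %result : vector<16xf16>
}
```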
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir
index 23a795c..77f6f7f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_distribute_shared_memory.mlir
@@ -1,77 +1,41 @@
// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule | FileCheck %s
-// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4)>
-// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>
-// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4 + 32)>
-// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-
+#executable_target = #hal.executable.target<"cuda", "cuda-nvptx-fb">
#map1 = affine_map<(d0, d1) -> (d0, d1)>
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>
- ]>
-]>
-
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 4, 1]>
module attributes {transform.with_named_sequence} {
- hal.executable private @shared_mem_cpy {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @shared_mem_cpy layout(#pipeline_layout) attributes {
- workgroup_size = [32: index, 4: index, 1:index]
- } {
- ^bb0(%arg0: !hal.device, %arg1 : index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- memref.global "private" @__shared_memory__ : memref<64x16xf32, #gpu.address_space<workgroup>>
- // CHECK-LABEL: @shared_mem_cpy(
+ memref.global "private" @__shared_memory__ : memref<64x16xf32, #gpu.address_space<workgroup>>
- func.func @shared_mem_cpy(%m0 : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>) {
- %c0 = arith.constant 0 : index
+ func.func @shared_mem_cpy(%m0 : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>)
+ attributes {hal.executable.target = #executable_target, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
- %0 = "affine.apply"(%c0) {map = affine_map<(d0) -> (d0)>} : (index) -> (index)
- %sm0 = memref.get_global @__shared_memory__ : memref<64x16xf32, #gpu.address_space<workgroup>>
- gpu.barrier
- // CHECK-DAG: %[[TX:.*]] = gpu.thread_id x
- // CHECK-DAG: %[[TY:.*]] = gpu.thread_id y
- // CHECK-DAG: %[[TZ:.*]] = gpu.thread_id z
+ %0 = "affine.apply"(%c0) {map = affine_map<(d0) -> (d0)>} : (index) -> (index)
+ %sm0 = memref.get_global @__shared_memory__ : memref<64x16xf32, #gpu.address_space<workgroup>>
+ gpu.barrier
- // CHECK-DAG: %[[Y0:.*]] = affine.apply #[[$MAP0]]()[%[[TX]], %[[TY]], %[[TZ]]]
- // CHECK-DAG: %[[X0:.*]] = affine.apply #[[$MAP1]]()[%[[TX]]]
- // CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[Y0]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[Y0]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, #gpu.address_space<workgroup>>
- // CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP2]]()[%[[TX]], %[[TY]], %[[TZ]]]
- // CHECK: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
- // CHECK: vector.transfer_write %[[R1]], %{{.*}}[%[[Y1]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, #gpu.address_space<workgroup>>
-
- linalg.generic {indexing_maps = [#map1, #map1],
- iterator_types = ["parallel", "parallel"]}
- ins(%m0 : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>)
- outs(%sm0 : memref<64x16xf32, #gpu.address_space<workgroup>>) {
- ^bb0(%arg3: f32, %s: f32):
- linalg.yield %arg3 : f32
- }
-
- // CHECK: linalg.generic
- linalg.generic {indexing_maps = [#map1, #map1],
- iterator_types = ["parallel", "parallel"]}
- ins(%sm0 : memref<64x16xf32, #gpu.address_space<workgroup>>)
- outs(%sm0 : memref<64x16xf32, #gpu.address_space<workgroup>>) {
- ^bb0(%arg4: f32, %s: f32):
- %add = arith.addf %arg4, %arg4 : f32
- linalg.yield %add : f32
- }
-
- return
- }
- }
+ linalg.generic {indexing_maps = [#map1, #map1],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%m0 : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>)
+ outs(%sm0 : memref<64x16xf32, #gpu.address_space<workgroup>>) {
+ ^bb0(%arg3: f32, %s: f32):
+ linalg.yield %arg3 : f32
}
+ linalg.generic {indexing_maps = [#map1, #map1],
+ iterator_types = ["parallel", "parallel"]}
+ ins(%sm0 : memref<64x16xf32, #gpu.address_space<workgroup>>)
+ outs(%sm0 : memref<64x16xf32, #gpu.address_space<workgroup>>) {
+ ^bb0(%arg4: f32, %s: f32):
+ %add = arith.addf %arg4, %arg4 : f32
+ linalg.yield %add : f32
+ }
+
+ return
}
- transform.named_sequence @__transform_main(%variant_op: !transform.any_op) {
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.named_sequence @__transform_main(%top_level: !transform.any_op) {
+ %func = transform.structured.match ops{["func.func"]} in %top_level : (!transform.any_op) -> !transform.any_op
transform.iree.gpu_distribute_shared_memory_copy %func : (!transform.any_op) -> ()
transform.apply_patterns to %func {
transform.apply_patterns.memref.fold_memref_alias_ops
@@ -82,3 +46,20 @@
transform.yield
}
}
+// CHECK-DAG: #[[$MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4)>
+// CHECK-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>
+// CHECK-DAG: #[[$MAP2:.*]] = affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 32 + s0 floordiv 4 + 32)>
+// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-LABEL: @shared_mem_cpy(
+// CHECK-DAG: %[[TX:.*]] = gpu.thread_id x
+// CHECK-DAG: %[[TY:.*]] = gpu.thread_id y
+// CHECK-DAG: %[[TZ:.*]] = gpu.thread_id z
+
+// CHECK-DAG: %[[Y0:.*]] = affine.apply #[[$MAP0]]()[%[[TX]], %[[TY]], %[[TZ]]]
+// CHECK-DAG: %[[X0:.*]] = affine.apply #[[$MAP1]]()[%[[TX]]]
+// CHECK: %[[R0:.*]] = vector.transfer_read %{{.*}}[%[[Y0]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R0]], %{{.*}}[%[[Y0]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[Y1:.*]] = affine.apply #[[$MAP2]]()[%[[TX]], %[[TY]], %[[TZ]]]
+// CHECK: %[[R1:.*]] = vector.transfer_read %{{.*}}[%[[Y1]], %[[X0]]], %{{.*}} {in_bounds = [true, true]} : memref<64x16xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
+// CHECK: vector.transfer_write %[[R1]], %{{.*}}[%[[Y1]], %[[X0]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<64x16xf32, #gpu.address_space<workgroup>>
+// CHECK: linalg.generic
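The transform-dialect test keeps its schedule in the same module, but the entry sequence now takes the top-level module (there is no `hal.executable.variant` to anchor on) and matches the payload function from there. A condensed sketch of the new structure, with the payload body elided:

```
module attributes {transform.with_named_sequence} {
  // Payload: a func.func carrying hal.executable.target / translation_info,
  // as in the hunk above.
  transform.named_sequence @__transform_main(%top_level: !transform.any_op) {
    %func = transform.structured.match ops{["func.func"]} in %top_level
        : (!transform.any_op) -> !transform.any_op
    transform.iree.gpu_distribute_shared_memory_copy %func : (!transform.any_op) -> ()
    transform.yield
  }
}
```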
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
index e576576..fa2c52d 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
@@ -1,45 +1,34 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse)))))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @simple_reduce {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @simple_reduce layout(#pipeline_layout) attributes {
- workgroup_size = [32 : index, 1 : index, 1 : index]
+#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
+module {
+ func.func @simple_reduce() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant dense<3.840000e+02> : vector<1xf32>
+ %c32 = arith.constant 32 : index
+ %c384 = arith.constant 384 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x384xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %thread_id_x = gpu.thread_id x
+ %2 = affine.apply #map()[%thread_id_x, %workgroup_id_x]
+ %3 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cst) -> (vector<1xf32>) {
+ %5 = vector.transfer_read %0[%2, %arg0], %cst_0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
+ %6 = vector.broadcast %5 : vector<32xf32> to vector<1x32xf32>
+ %7 = vector.multi_reduction <add>, %6, %arg1 [1] : vector<1x32xf32> to vector<1xf32>
+ scf.yield %7 : vector<1xf32>
}
- builtin.module {
- func.func @simple_reduce() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
- %cst_0 = arith.constant 0.000000e+00 : f32
- %cst_1 = arith.constant dense<3.840000e+02> : vector<1xf32>
- %c32 = arith.constant 32 : index
- %c384 = arith.constant 384 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x384xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %2 = gpu.thread_id x
- %3 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>()[%2, %workgroup_id_x]
- %4 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cst) -> (vector<1xf32>) {
- %6 = vector.transfer_read %0[%3, %arg0], %cst_0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
- %7 = vector.broadcast %6 : vector<32xf32> to vector<1x32xf32>
- %8 = vector.multi_reduction <add>, %7, %arg1 [1] : vector<1x32xf32> to vector<1xf32>
- scf.yield %8 : vector<1xf32>
- }
- %5 = arith.divf %4, %cst_1 : vector<1xf32>
- vector.transfer_write %5, %1[%3] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
- return
- }
- }
+ %4 = arith.divf %3, %cst_1 : vector<1xf32>
+ vector.transfer_write %4, %1[%2] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
+ return
}
}
-// CHECK-LABEL: func.func @simple_reduce() {
+// CHECK-LABEL: func.func @simple_reduce()
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
@@ -81,51 +70,37 @@
// Make sure memref.load ops from uniform buffers are hoisted out as uniform code.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, uniform_buffer>
- ]>
-]>
-hal.executable private @reduce_uniform_buffer_offset {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @reduce_uniform_buffer_offset layout(#pipeline_layout) attributes {
- workgroup_size = [32 : index, 1 : index, 1 : index]
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
+#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
+module {
+ func.func @reduce_uniform_buffer_offset() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant dense<3.840000e+02> : vector<1xf32>
+ %c32 = arith.constant 32 : index
+ %c384 = arith.constant 384 : index
+ %0 = hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) offset(%c0) : memref<1xvector<4xi32>, #hal.descriptor_type<uniform_buffer>>
+ %1 = memref.load %0[%c0] : memref<1xvector<4xi32>, #hal.descriptor_type<uniform_buffer>>
+ %2 = vector.extractelement %1[%c0 : index] : vector<4xi32>
+ %3 = vector.extractelement %1[%c1 : index] : vector<4xi32>
+ %4 = arith.index_castui %2 : i32 to index
+ %5 = arith.index_castui %3 : i32 to index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : memref<128x384xf32>
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : memref<128xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %thread_id_x = gpu.thread_id x
+ %8 = affine.apply #map()[%thread_id_x, %workgroup_id_x]
+ %9 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cst) -> (vector<1xf32>) {
+ %11 = vector.transfer_read %6[%8, %arg0], %cst_0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
+ %12 = vector.broadcast %11 : vector<32xf32> to vector<1x32xf32>
+ %13 = vector.multi_reduction <add>, %12, %arg1 [1] : vector<1x32xf32> to vector<1xf32>
+ scf.yield %13 : vector<1xf32>
}
- builtin.module {
- func.func @reduce_uniform_buffer_offset() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cv = arith.constant dense<0.000000e+00> : vector<1xf32>
- %f0 = arith.constant 0.000000e+00 : f32
- %fv = arith.constant dense<3.840000e+02> : vector<1xf32>
- %c32 = arith.constant 32 : index
- %c384 = arith.constant 384 : index
-
- %ub = hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) offset(%c0) : memref<1xvector<4xi32>, #hal.descriptor_type<uniform_buffer>>
- %offsets = memref.load %ub[%c0] : memref<1xvector<4xi32>, #hal.descriptor_type<uniform_buffer>>
- %o0 = vector.extractelement %offsets[%c0 : index] : vector<4xi32>
- %o1 = vector.extractelement %offsets[%c1 : index] : vector<4xi32>
- %offset0 = arith.index_castui %o0 : i32 to index
- %offset1 = arith.index_castui %o1 : i32 to index
-
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%offset0) : memref<128x384xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%offset1) : memref<128xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %2 = gpu.thread_id x
- %3 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>()[%2, %workgroup_id_x]
- %4 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cv) -> (vector<1xf32>) {
- %6 = vector.transfer_read %0[%3, %arg0], %f0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
- %7 = vector.broadcast %6 : vector<32xf32> to vector<1x32xf32>
- %8 = vector.multi_reduction <add>, %7, %arg1 [1] : vector<1x32xf32> to vector<1xf32>
- scf.yield %8 : vector<1xf32>
- }
- %5 = arith.divf %4, %fv : vector<1xf32>
- vector.transfer_write %5, %1[%3] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
- return
- }
- }
+ %10 = arith.divf %9, %cst_1 : vector<1xf32>
+ vector.transfer_write %10, %7[%8] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
+ return
}
}
@@ -150,51 +125,38 @@
// Make sure memref.load ops from readonly storage buffers are hoisted out as uniform code.
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @reduce_storage_buffer_offset {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @reduce_storage_buffer_offset layout(#pipeline_layout) attributes {
- workgroup_size = [32 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @reduce_storage_buffer_offset() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cv = arith.constant dense<0.000000e+00> : vector<1xf32>
- %f0 = arith.constant 0.000000e+00 : f32
- %fv = arith.constant dense<3.840000e+02> : vector<1xf32>
- %c32 = arith.constant 32 : index
- %c384 = arith.constant 384 : index
- %ub = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1xvector<4xi32>, #hal.descriptor_type<storage_buffer>>
- %offsets = memref.load %ub[%c0] : memref<1xvector<4xi32>, #hal.descriptor_type<storage_buffer>>
- %o0 = vector.extractelement %offsets[%c0 : index] : vector<4xi32>
- %o1 = vector.extractelement %offsets[%c1 : index] : vector<4xi32>
- %offset0 = arith.index_castui %o0 : i32 to index
- %offset1 = arith.index_castui %o1 : i32 to index
-
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%offset0) : memref<128x384xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%offset1) : memref<128xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %2 = gpu.thread_id x
- %3 = affine.apply affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>()[%2, %workgroup_id_x]
- %4 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cv) -> (vector<1xf32>) {
- %6 = vector.transfer_read %0[%3, %arg0], %f0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
- %7 = vector.broadcast %6 : vector<32xf32> to vector<1x32xf32>
- %8 = vector.multi_reduction <add>, %7, %arg1 [1] : vector<1x32xf32> to vector<1xf32>
- scf.yield %8 : vector<1xf32>
- }
- %5 = arith.divf %4, %fv : vector<1xf32>
- vector.transfer_write %5, %1[%3] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
- return
+#map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)>
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
+module {
+ func.func @reduce_storage_buffer_offset() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant dense<3.840000e+02> : vector<1xf32>
+ %c32 = arith.constant 32 : index
+ %c384 = arith.constant 384 : index
+ %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1xvector<4xi32>, #hal.descriptor_type<storage_buffer>>
+ %1 = memref.load %0[%c0] : memref<1xvector<4xi32>, #hal.descriptor_type<storage_buffer>>
+ %2 = vector.extractelement %1[%c0 : index] : vector<4xi32>
+ %3 = vector.extractelement %1[%c1 : index] : vector<4xi32>
+ %4 = arith.index_castui %2 : i32 to index
+ %5 = arith.index_castui %3 : i32 to index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : memref<128x384xf32>
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : memref<128xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %thread_id_x = gpu.thread_id x
+ %8 = affine.apply #map()[%thread_id_x, %workgroup_id_x]
+ %9 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cst) -> (vector<1xf32>) {
+ %11 = vector.transfer_read %6[%8, %arg0], %cst_0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
+ %12 = vector.broadcast %11 : vector<32xf32> to vector<1x32xf32>
+ %13 = vector.multi_reduction <add>, %12, %arg1 [1] : vector<1x32xf32> to vector<1xf32>
+ scf.yield %13 : vector<1xf32>
}
- }
+ %10 = arith.divf %9, %cst_1 : vector<1xf32>
+ vector.transfer_write %10, %7[%8] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
+ return
}
}
@@ -217,39 +179,27 @@
// -----
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @shared_memory_copy {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @shared_memory_copy layout(#pipeline_layout) attributes {
- workgroup_size = [32 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @shared_memory_copy() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
- %cst_0 = arith.constant 0.000000e+00 : f32
- %c32 = arith.constant 32 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>>
- %2 = vector.transfer_read %0[%workgroup_id_x, %c0], %cst_0 {in_bounds = [true]} : memref<128x32xf32>, vector<32xf32>
- vector.transfer_write %2, %alloc[%c0] {in_bounds = [true]} : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
- gpu.barrier
- %3 = vector.transfer_read %alloc[%c0], %cst_0 {in_bounds = [true]} : memref<32xf32, #gpu.address_space<workgroup>>, vector<32xf32>
- vector.transfer_write %3, %1[%workgroup_id_x, %c0] {in_bounds = [true]} : vector<32xf32>, memref<128x32xf32>
- return
- }
- }
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
+module {
+ func.func @shared_memory_copy() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %c32 = arith.constant 32 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>>
+ %2 = vector.transfer_read %0[%workgroup_id_x, %c0], %cst_0 {in_bounds = [true]} : memref<128x32xf32>, vector<32xf32>
+ vector.transfer_write %2, %alloc[%c0] {in_bounds = [true]} : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
+ gpu.barrier
+ %3 = vector.transfer_read %alloc[%c0], %cst_0 {in_bounds = [true]} : memref<32xf32, #gpu.address_space<workgroup>>, vector<32xf32>
+ vector.transfer_write %3, %1[%workgroup_id_x, %c0] {in_bounds = [true]} : vector<32xf32>, memref<128x32xf32>
+ return
}
}
-// CHECK-LABEL: func.func @shared_memory_copy() {
+// CHECK-LABEL: func.func @shared_memory_copy()
// CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>>
// CHECK: vector.transfer_read {{.*}} : memref<128x32xf32>, vector<1xf32>
// CHECK: vector.transfer_write {{.*}} %[[ALLOC]]{{.*}} : vector<1xf32>, memref<32xf32, #gpu.address_space<workgroup>>
@@ -263,53 +213,42 @@
// Check that the multi-row matvec gets distributed across subgroup threads.
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @multirow {
- hal.executable.variant @rocm target(#executable_target_rocm_hsaco_fb) {
- hal.executable.export @multirow layout(#pipeline_layout) attributes {
- workgroup_size = [64 : index, 1 : index, 1 : index]
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [64, 1, 1]>
+#map = affine_map<()[s0] -> (s0 * 4)>
+#map1 = affine_map<(d0, d1) -> (0, d1)>
+module {
+ func.func @multirow() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb, translation_info = #translation_info} {
+ %cst = arith.constant dense<0.000000e+00> : vector<4x512xf16>
+ %c0 = arith.constant 0 : index
+ %cst_0 = arith.constant dense<0.000000e+00> : vector<1x4xf16>
+ %c4096 = arith.constant 4096 : index
+ %c512 = arith.constant 512 : index
+ %cst_1 = arith.constant 0.000000e+00 : f16
+ %thread_id_x = gpu.thread_id x
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>
+ memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
+ memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>
+ memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %3 = affine.apply #map()[%workgroup_id_x]
+ %4 = scf.for %arg0 = %c0 to %c4096 step %c512 iter_args(%arg1 = %cst) -> (vector<4x512xf16>) {
+ %8 = vector.transfer_read %0[%c0, %arg0], %cst_1 {in_bounds = [true, true], permutation_map = #map1} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x512xf16>
+ %9 = vector.transfer_read %1[%3, %arg0], %cst_1 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x512xf16>
+ %10 = arith.mulf %8, %9 : vector<4x512xf16>
+ %11 = arith.addf %arg1, %10 : vector<4x512xf16>
+ scf.yield %11 : vector<4x512xf16>
}
- builtin.module {
- func.func @multirow() {
- %cst = arith.constant dense<0.000000e+00> : vector<4x512xf16>
- %c0 = arith.constant 0 : index
- %cst_0 = arith.constant dense<0.000000e+00> : vector<1x4xf16>
- %c4096 = arith.constant 4096 : index
- %c512 = arith.constant 512 : index
- %cst_1 = arith.constant 0.000000e+00 : f16
- %id = gpu.thread_id x
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>
- memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
- memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>
- memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
- %4 = scf.for %arg0 = %c0 to %c4096 step %c512 iter_args(%arg1 = %cst) -> (vector<4x512xf16>) {
- %8 = vector.transfer_read %0[%c0, %arg0], %cst_1 {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (0, d1)>} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x512xf16>
- %9 = vector.transfer_read %1[%3, %arg0], %cst_1 {in_bounds = [true, true]} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x512xf16>
- %10 = arith.mulf %8, %9 : vector<4x512xf16>
- %11 = arith.addf %arg1, %10 : vector<4x512xf16>
- scf.yield %11 : vector<4x512xf16>
- }
- %5 = vector.broadcast %4 : vector<4x512xf16> to vector<1x4x512xf16>
- %6 = vector.multi_reduction <add>, %5, %cst_0 [2] : vector<1x4x512xf16> to vector<1x4xf16>
- %7 = vector.extract %6[0] : vector<4xf16> from vector<1x4xf16>
- vector.transfer_write %7, %2[%c0, %3] {in_bounds = [true]} : vector<4xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>
- return
- }
- }
+ %5 = vector.broadcast %4 : vector<4x512xf16> to vector<1x4x512xf16>
+ %6 = vector.multi_reduction <add>, %5, %cst_0 [2] : vector<1x4x512xf16> to vector<1x4xf16>
+ %7 = vector.extract %6[0] : vector<4xf16> from vector<1x4xf16>
+ vector.transfer_write %7, %2[%c0, %3] {in_bounds = [true]} : vector<4xf16>, memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>
+ return
}
}
-// CHECK-LABEL: func.func @multirow() {
+// CHECK-LABEL: func.func @multirow()
// CHECK: scf.for {{.*}} -> (vector<4x8xf16>) {
// CHECK: vector.transfer_read {{.*}} : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x8xf16>
// CHECK: vector.transfer_read {{.*}} : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>, vector<4x8xf16>
@@ -323,30 +262,17 @@
// CHECK-NEXT: return
// -----
-
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @simple_nd_write {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @simple_nd_write layout(#pipeline_layout) attributes {
- workgroup_size = [32 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @simple_nd_write() {
- %c0 = arith.constant 0 : index
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4x1024xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<4x1024xf32>
- %6 = vector.transfer_read %0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x1024xf32>, vector<4x1024xf32>
- vector.transfer_write %6, %1[%c0, %c0] {in_bounds = [true, true]} : vector<4x1024xf32>, memref<4x1024xf32>
- return
- }
- }
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [32, 1, 1]>
+module {
+ func.func @simple_nd_write() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4x1024xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<4x1024xf32>
+ %2 = vector.transfer_read %0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x1024xf32>, vector<4x1024xf32>
+ vector.transfer_write %2, %1[%c0, %c0] {in_bounds = [true, true]} : vector<4x1024xf32>, memref<4x1024xf32>
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
index a77b3c0..3283e27 100644
--- a/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp
@@ -159,28 +159,27 @@
}
void EliminateEmptyTensorsPass::runOnOperation() {
- ModuleOp moduleOp = getOperation();
+ auto funcOp = getOperation();
MLIRContext *context = &getContext();
// Run the convert to destination style patterns.
{
RewritePatternSet patterns(context);
linalg::populateConvertToDestinationStylePatterns(patterns);
- if (failed(applyPatternsAndFoldGreedily(moduleOp, std::move(patterns)))) {
- moduleOp->emitOpError(
- "Failed in conversion to destination style patterns");
+ if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
+ funcOp->emitOpError("Failed in conversion to destination style patterns");
return signalPassFailure();
}
}
- IRRewriter rewriter(moduleOp->getContext());
+ IRRewriter rewriter(funcOp->getContext());
auto bufferizationOptions = getBufferizationOptions();
- OneShotAnalysisState state(moduleOp, bufferizationOptions);
+ OneShotAnalysisState state(funcOp, bufferizationOptions);
// Analyze IR.
- if (failed(analyzeOp(moduleOp, state)))
+ if (failed(analyzeOp(funcOp, state)))
return signalPassFailure();
// Eliminate empty tensors.
- if (failed(bufferization::eliminateEmptyTensors(rewriter, moduleOp, state)))
+ if (failed(bufferization::eliminateEmptyTensors(rewriter, funcOp, state)))
return signalPassFailure();
}
@@ -199,14 +198,14 @@
/// Run comprehensive bufferize.
void IREEComprehensiveBufferizePass::runOnOperation() {
- ModuleOp moduleOp = getOperation();
+ auto funcOp = getOperation();
IREEOneShotBufferizationOptions options = getBufferizationOptions();
options.testAnalysisOnly = testAnalysisOnly;
options.printConflicts = printConflicts;
options.allocationFn = allocationFn;
options.memCpyFn = memCpyFn;
- if (failed(runIREEOneShotBufferize(moduleOp, options))) {
+ if (failed(runIREEOneShotBufferize(funcOp, options))) {
return signalPassFailure();
}
@@ -214,17 +213,19 @@
{
RewritePatternSet patterns(&getContext());
linalg::populateEraseUnusedOperandsAndResultsPatterns(patterns);
- if (failed(applyPatternsAndFoldGreedily(moduleOp, std::move(patterns)))) {
+ if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
return signalPassFailure();
}
}
}
-std::unique_ptr<OperationPass<ModuleOp>> createEliminateEmptyTensorsPass() {
+std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+createEliminateEmptyTensorsPass() {
return std::make_unique<EliminateEmptyTensorsPass>();
}
-std::unique_ptr<OperationPass<ModuleOp>> createIREEComprehensiveBufferizePass(
+std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+createIREEComprehensiveBufferizePass(
std::optional<BufferizationOptions::AllocationFn> allocationFn,
std::optional<BufferizationOptions::MemCpyFn> memCpyFn) {
if (!allocationFn)
@@ -235,26 +236,26 @@
memCpyFn);
}
-void addIREEPostBufferizationPasses(OpPassManager &passManager) {
- passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCSEPass());
+void addIREEPostBufferizationPasses(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// There are redundant memcpy (with linalg.generic form) ops created, which
// can be deleted by canonicalizer. We have to run it again because the
  // memrefs are unified in CSE pass, so we can truly remove redundant memcpy.
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCleanupBufferAllocViewPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCleanupBufferAllocViewPass());
}
void addIREEComprehensiveBufferizePasses(
- OpPassManager &passManager,
+ OpPassManager &funcPassManager,
std::optional<BufferizationOptions::AllocationFn> allocationFn,
std::optional<BufferizationOptions::MemCpyFn> memCpyFn) {
- passManager.addPass(createEliminateEmptyTensorsPass());
- passManager.addPass(bufferization::createEmptyTensorToAllocTensorPass());
- passManager.addPass(
+ funcPassManager.addPass(createEliminateEmptyTensorsPass());
+ funcPassManager.addPass(bufferization::createEmptyTensorToAllocTensorPass());
+ funcPassManager.addPass(
createIREEComprehensiveBufferizePass(allocationFn, memCpyFn));
- addIREEPostBufferizationPasses(passManager);
+ addIREEPostBufferizationPasses(funcPassManager);
}
} // namespace mlir::iree_compiler
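
With bufferization now declared on `FunctionOpInterface`, `addIREEComprehensiveBufferizePasses` expects a pass manager that is already nested on the function-like op. A minimal caller-side sketch, assuming a backend pipeline that nests on `func.func`; the surrounding function name is illustrative and not part of this change:

```
// Sketch only: how a backend pipeline might nest before calling the helper.
void addExampleBackendBufferizePasses(OpPassManager &variantPassManager) {
  OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
  // Nest on func.func; dispatches with other function-like ops would need
  // their own nest (see getFunctionOpInterfacePassManager further below).
  OpPassManager &funcPassManager = modulePassManager.nest<func::FuncOp>();
  addIREEComprehensiveBufferizePasses(funcPassManager);
}
```
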
diff --git a/compiler/src/iree/compiler/Codegen/Common/LowerExecutableUsingTransformDialect.cpp b/compiler/src/iree/compiler/Codegen/Common/LowerExecutableUsingTransformDialect.cpp
new file mode 100644
index 0000000..592ddb8
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/LowerExecutableUsingTransformDialect.cpp
@@ -0,0 +1,71 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-dialects/Dialect/LinalgTransform/Passes.h"
+#include "iree/compiler/Codegen/Common/PassDetail.h"
+#include "iree/compiler/Codegen/Common/Passes.h"
+
+namespace mlir::iree_compiler {
+
+namespace {
+class LowerExecutableUsingTransformDialectPass
+ : public LowerExecutableUsingTransformDialectBase<
+ LowerExecutableUsingTransformDialectPass> {
+public:
+ void runOnOperation() override;
+};
+} // namespace
+
+void LowerExecutableUsingTransformDialectPass::runOnOperation() {
+ auto moduleOp = getOperation();
+ auto funcOps = moduleOp.getOps<FunctionOpInterface>();
+
+ if (funcOps.empty() || !llvm::hasSingleElement(funcOps)) {
+    // Can only handle dispatches with a single function on the transform
+    // dialect path.
+ return;
+ }
+
+ auto funcOp = *funcOps.begin();
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (!translationInfo || translationInfo.getDispatchLoweringPassPipeline() !=
+ IREE::Codegen::DispatchLoweringPassPipeline::
+ TransformDialectCodegen) {
+ return;
+ }
+
+ // Run the interpreter and drop schedule passes.
+ SymbolRefAttr codegenSpec = translationInfo.getCodegenSpec();
+ StringRef entryPoint =
+ codegenSpec ? codegenSpec.getLeafReference() : StringRef("");
+ OpPassManager modulePassManager(ModuleOp::getOperationName());
+ modulePassManager.addPass(
+ iree_compiler::createTransformDialectInterpreterPass(entryPoint));
+ modulePassManager.addPass(createDropSchedulePass());
+ if (failed(runPipeline(modulePassManager, moduleOp))) {
+ moduleOp.emitOpError("failed to run transform dialect passes");
+ return signalPassFailure();
+ }
+
+ // Make sure that the translation info is set to `None` to avoid using
+ // other pass pipelines.
+ auto translationInfoModified = getTranslationInfo(funcOp);
+ if (!translationInfoModified ||
+ translationInfoModified.getDispatchLoweringPassPipeline() !=
+ IREE::Codegen::DispatchLoweringPassPipeline::None) {
+ funcOp->emitOpError("expected transform dialect lowering to set the "
+ "translation_info to use None");
+ return signalPassFailure();
+ }
+}
+
+std::unique_ptr<OperationPass<ModuleOp>>
+createLowerExecutableUsingTransformDialectPass() {
+ return std::make_unique<LowerExecutableUsingTransformDialectPass>();
+}
+
+} // namespace mlir::iree_compiler
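
Since the transform dialect interpreter still needs `ModuleOp` granularity, this pass is meant to run before the per-backend `*SelectLoweringStrategy` passes. A hedged sketch of the expected placement; the strategy-selection pass mentioned in the comment is backend-specific and only illustrative:

```
// Sketch only: expected position of the new pass in an executable pipeline.
void buildExampleExecutableLoweringPipeline(
    OpPassManager &variantPassManager) {
  OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
  // Runs the transform interpreter when translation_info requests
  // TransformDialectCodegen; on success it resets the pipeline to `None`.
  modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());
  // The backend's *SelectLoweringStrategy pass (still a Module pass) would be
  // added here; subsequent lowering pipelines skip functions whose pipeline
  // is already `None`.
}
```
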
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
index 1a918fe..e75b8f1 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
@@ -73,95 +73,86 @@
}
void runOnOperation() override {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- ModuleOp moduleOp = variantOp.getInnerModule();
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
- MLIRContext *context = moduleOp.getContext();
+ auto moduleOp = getOperation();
+ MLIRContext *context = &getContext();
+ for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
- // Parse the file path and kernel config strategy from flags. There are
- // two possible usage flows for transform dialect libraries.
- // 1. Use `__kernel_config` to match and annotate variants with the
- // strategy to use. This could either be a transform dialect strategy
- // or any other IREE codegen pipeline.
- //
- // 2. Use the configuration strategy to do codegen directly. At the end of
- // the strategy, the variant needs to be annotated with
- // "translation_info" = #iree_codegen.translation_info<None>
- SmallVector<StringRef, 2> parts;
- llvm::SplitString(llvm::StringRef(clCodegenTransformDialectLibraryFileName),
- parts, "@");
- if (parts.size() > 2) {
- variantOp.emitError()
- << "Invalid transform library path and sequence name "
- << clCodegenTransformDialectLibraryFileName;
- return signalPassFailure();
- }
- bool hasTransformLibrary = !parts.empty();
-
- std::string libraryFileName;
- if (hasTransformLibrary) {
- if (parts[0].empty()) {
- variantOp.emitError() << "Cannot specify an empty library path";
+ // Parse the file path and kernel config strategy from flags. There are
+ // two possible usage flows for transform dialect libraries.
+ // 1. Use `__kernel_config` to match and annotate variants with the
+ // strategy to use. This could either be a transform dialect strategy
+ // or any other IREE codegen pipeline.
+ //
+ // 2. Use the configuration strategy to do codegen directly. At the end
+ // of
+ // the strategy, the variant needs to be annotated with
+ // "translation_info" = #iree_codegen.translation_info<None>
+ SmallVector<StringRef, 2> parts;
+ llvm::SplitString(
+ llvm::StringRef(clCodegenTransformDialectLibraryFileName), parts,
+ "@");
+ if (parts.size() > 2) {
+ funcOp.emitError()
+ << "Invalid transform library path and sequence name "
+ << clCodegenTransformDialectLibraryFileName;
return signalPassFailure();
}
- libraryFileName = parts[0];
- }
+ bool hasTransformLibrary = !parts.empty();
- std::string entrySequenceName;
- // Check if the user specified a custom entry point name.
- if (parts.size() == 2) {
- if (parts[1].empty()) {
- variantOp.emitError() << "Cannot specify an empty sequence name";
- return signalPassFailure();
+ std::string libraryFileName;
+ if (hasTransformLibrary) {
+ if (parts[0].empty()) {
+ funcOp.emitError() << "Cannot specify an empty library path";
+ return signalPassFailure();
+ }
+ libraryFileName = parts[0];
}
- entrySequenceName = parts[1];
- } else {
- entrySequenceName = "__kernel_config";
- }
- LDBG("MaterializeUserConfigsPass on variant: " << variantOp);
- std::optional<ModuleOp> transformLibrary = std::nullopt;
- if (hasTransformLibrary) {
- auto dialect =
- context->getOrLoadDialect<IREE::Codegen::IREECodegenDialect>();
- auto maybeTransformLibrary =
- dialect->getOrLoadTransformLibraryModule(libraryFileName);
- if (failed(maybeTransformLibrary)) {
- variantOp.emitError()
- << "failed to load transform library module: " << libraryFileName;
- return signalPassFailure();
+ std::string entrySequenceName;
+ // Check if the user specified a custom entry point name.
+ if (parts.size() == 2) {
+ if (parts[1].empty()) {
+ funcOp.emitError() << "Cannot specify an empty sequence name";
+ return signalPassFailure();
+ }
+ entrySequenceName = parts[1];
+ } else {
+ entrySequenceName = "__kernel_config";
}
- transformLibrary = *maybeTransformLibrary;
- LDBG("--found transform library @" << libraryFileName);
- auto runResult = runTransformConfigurationStrategy(
- variantOp, entrySequenceName, *transformLibrary);
- if (runResult == StrategyRunResult::NotFound) {
- variantOp.emitError() << "transform kernel config strategy `"
- << entrySequenceName << " not found";
- return signalPassFailure();
- } else if (runResult == StrategyRunResult::Failed) {
- variantOp.emitError() << "transform kernel config strategy `"
- << entrySequenceName << "` failed to apply";
- return signalPassFailure();
- }
- }
+ LDBG("MaterializeUserConfigsPass on function: " << funcOp);
+ std::optional<ModuleOp> transformLibrary = std::nullopt;
+ if (hasTransformLibrary) {
+ auto dialect =
+ context->getOrLoadDialect<IREE::Codegen::IREECodegenDialect>();
+ auto maybeTransformLibrary =
+ dialect->getOrLoadTransformLibraryModule(libraryFileName);
+ if (failed(maybeTransformLibrary)) {
+ funcOp.emitError()
+ << "failed to load transform library module: " << libraryFileName;
+ return signalPassFailure();
+ }
+ transformLibrary = *maybeTransformLibrary;
+ LDBG("--found transform library @" << libraryFileName);
- LDBG("--start iterating over: "
- << std::distance(moduleOp.getOps<mlir::FunctionOpInterface>().begin(),
- moduleOp.getOps<mlir::FunctionOpInterface>().end())
- << " functions");
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo;
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = exportOps.lookup(funcOp.getName());
- if (!exportOp) {
- continue;
+ auto runResult = runTransformConfigurationStrategy(
+ funcOp, entrySequenceName, *transformLibrary);
+ if (runResult == StrategyRunResult::NotFound) {
+ funcOp.emitError() << "transform kernel config strategy `"
+ << entrySequenceName << " not found";
+ return signalPassFailure();
+ } else if (runResult == StrategyRunResult::Failed) {
+ funcOp.emitError() << "transform kernel config strategy `"
+ << entrySequenceName << "` failed to apply";
+ return signalPassFailure();
+ }
}
      /// Nothing to do if the function already has a config.
- if (getTranslationInfo(exportOp)) {
- continue;
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (translationInfo) {
+ return;
}
/// First, apply all user configs.
@@ -175,52 +166,35 @@
});
if (res.wasInterrupted()) {
- moduleOp.emitOpError("error in setting user configuration");
+ funcOp.emitOpError("error in setting user configuration");
return signalPassFailure();
}
- }
- LDBG("--guaranteed unique translationInfo: " << translationInfo);
- /// We only need to resolve symbols for transform dialect based strategies.
- if (!translationInfo ||
- translationInfo.value().getDispatchLoweringPassPipeline() !=
- IREE::Codegen::DispatchLoweringPassPipeline::
- TransformDialectCodegen) {
- return;
- }
-
- // From now on, we know we have a transform dialect strategy. We now need to
- // ensure it can resolve and apply in a subsequent interpreter pass or else
- // we need to fall back to codegen.
- bool failedToResolve = false;
- auto g = llvm::make_scope_exit([&]() {
- if (!failedToResolve)
+ translationInfo = getTranslationInfo(funcOp);
+ LDBG("--guaranteed unique translationInfo: " << translationInfo);
+ /// We only need to resolve symbols for transform dialect based
+ /// strategies.
+ if (!translationInfo ||
+ translationInfo.getDispatchLoweringPassPipeline() !=
+ IREE::Codegen::DispatchLoweringPassPipeline::
+ TransformDialectCodegen) {
return;
-
- exportOps = getAllEntryPoints(variantOp.getInnerModule());
- for (auto &it : exportOps) {
- auto exportOp = it.second;
- if (getTranslationInfo(exportOp) == translationInfo) {
- exportOp->removeAttr(kTranslationInfoAttrName);
- }
}
- });
- std::optional<SymbolRefAttr> strategyName =
- translationInfo.value().getCodegenSpec();
- if (!strategyName || *strategyName == SymbolRefAttr()) {
- failedToResolve = true;
- return;
- }
+ std::optional<SymbolRefAttr> strategyName =
+ translationInfo.getCodegenSpec();
+ if (!strategyName || *strategyName == SymbolRefAttr()) {
+ return;
+ }
- /// If we have a symbol, verify the existence of the symbol within the
- /// transform library.
- StringRef entryPoint = strategyName->getLeafReference();
- if (!transformLibrary || !(*transformLibrary) ||
- !transform::detail::findTransformEntryPoint(
- variantOp, *transformLibrary, entryPoint)) {
- moduleOp.emitOpError("failed to find transform strategy symbol");
- failedToResolve = true;
+ /// If we have a symbol, verify the existence of the symbol within the
+ /// transform library.
+ StringRef entryPoint = strategyName->getLeafReference();
+ if (!transformLibrary || !(*transformLibrary) ||
+ !transform::detail::findTransformEntryPoint(funcOp, *transformLibrary,
+ entryPoint)) {
+ funcOp.emitOpError("failed to find transform strategy symbol");
+ }
}
}
@@ -231,8 +205,7 @@
} // namespace
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
-createMaterializeUserConfigsPass() {
+std::unique_ptr<OperationPass<ModuleOp>> createMaterializeUserConfigsPass() {
return std::make_unique<MaterializeUserConfigsPass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/PassUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/PassUtils.cpp
new file mode 100644
index 0000000..b0c8e27
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/PassUtils.cpp
@@ -0,0 +1,23 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/PassUtils.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+
+namespace mlir::iree_compiler {
+
+std::optional<OpPassManager>
+getFunctionOpInterfacePassManager(FunctionOpInterface interfaceOp) {
+ return TypeSwitch<Operation *, std::optional<OpPassManager>>(
+ interfaceOp.getOperation())
+ .Case<func::FuncOp, IREE::Util::FuncOp>(
+ [&](auto funcOp) { return OpPassManager(funcOp.getOperationName()); })
+ .Default([&](Operation *op) { return std::nullopt; });
+}
+
+} // namespace mlir::iree_compiler
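
A brief sketch of how this helper is expected to be used. The pass class named here is hypothetical and the pipeline contents are placeholders, but `Pass::runPipeline` is the same mechanism used by `LowerExecutableUsingTransformDialectPass` above:

```
// Sketch only: running a dynamically built pipeline on a function-like op.
// The class declaration (derived from the tablegen-generated base) is omitted.
void ExampleLowerExecutableTargetPass::runOnOperation() {
  FunctionOpInterface funcOp = getOperation();
  std::optional<OpPassManager> maybePipeline =
      getFunctionOpInterfacePassManager(funcOp);
  if (!maybePipeline) {
    funcOp.emitOpError("unhandled function-like container during lowering");
    return signalPassFailure();
  }
  // Placeholder passes; a real backend would add its lowering pipeline here.
  maybePipeline->addPass(createCanonicalizerPass());
  maybePipeline->addPass(createCSEPass());
  if (failed(runPipeline(*maybePipeline, funcOp.getOperation())))
    return signalPassFailure();
}
```
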
diff --git a/compiler/src/iree/compiler/Codegen/Common/PassUtils.h b/compiler/src/iree/compiler/Codegen/Common/PassUtils.h
new file mode 100644
index 0000000..b81709a
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/PassUtils.h
@@ -0,0 +1,27 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_COMPILER_CODEGEN_COMMON_PASSUTILS_H_
+#define IREE_COMPILER_CODEGEN_COMMON_PASSUTILS_H_
+
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "iree/compiler/Utils/PassUtils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Pass/PassManager.h"
+
+namespace mlir::iree_compiler {
+
+/// Pass manager nesting for `FunctionOpInterface` ops.
+using FunctionLikeNest = MultiOpNest<func::FuncOp>;
+
+/// Helper method to get a pass manager nested at `FunctionOpInterface`.
+std::optional<OpPassManager>
+getFunctionOpInterfacePassManager(FunctionOpInterface funcOp);
+
+} // namespace mlir::iree_compiler
+
+#endif // IREE_COMPILER_CODEGEN_COMMON_PASSUTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.cpp b/compiler/src/iree/compiler/Codegen/Common/Passes.cpp
index ca5587c..8fac10a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.cpp
@@ -5,20 +5,20 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"
namespace mlir::iree_compiler {
void addCommonTargetExecutablePreprocessingPasses(
- OpPassManager &passManager, bool useDecomposeSoftmaxFusion) {
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(createTypePropagationPass());
- nestedModulePM.addPass(createBubbleUpOrdinalOpsPass());
- nestedModulePM.addPass(createBufferizeCopyOnlyDispatchesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createDecomposeSoftmaxPass(useDecomposeSoftmaxFusion));
- passManager.addPass(createMaterializeUserConfigsPass());
+ FunctionLikeNest &funcPassManager, bool useDecomposeSoftmaxFusion) {
+ funcPassManager.addPass(createTypePropagationPass)
+ .addPass(createBubbleUpOrdinalOpsPass)
+ .addPass(createBufferizeCopyOnlyDispatchesPass)
+ .addPass([&]() {
+ return createDecomposeSoftmaxPass(useDecomposeSoftmaxFusion);
+ });
}
//===---------------------------------------------------------------------===//
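
For reference, a hedged sketch of the caller side after this signature change: the backend builds a `FunctionLikeNest` over its module-level pass manager, and `MaterializeUserConfigs` (now a Module pass, see below) is scheduled by the caller instead of inside this helper. Names other than the functions appearing in this diff are illustrative.

```
// Sketch only: caller-side use of the new FunctionLikeNest-based signature.
void addExampleTargetPreprocessingPasses(OpPassManager &modulePassManager) {
  FunctionLikeNest funcPassManager(modulePassManager);
  addCommonTargetExecutablePreprocessingPasses(funcPassManager);
  // Previously added inside the helper; the caller now schedules it at module
  // granularity after the function-level preprocessing.
  modulePassManager.addPass(createMaterializeUserConfigsPass());
}
```
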
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h
index 48cfad4..ce676b3 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -14,6 +14,7 @@
#include <limits>
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -29,16 +30,16 @@
/// Passes that are done on all backends before target-specific code-generation
/// kicks in.
void addCommonTargetExecutablePreprocessingPasses(
- OpPassManager &passManager, bool useDecomposeSoftmaxFusion = true);
+ FunctionLikeNest &funcPassManager, bool useDecomposeSoftmaxFusion = true);
/// Post-bufferization passes run to cleanup the IR
/// (ResolveShapedTypeResultDims, Canonicalization/CSE and
/// CleanupBufferAllocView).
-void addIREEPostBufferizationPasses(OpPassManager &passManager);
+void addIREEPostBufferizationPasses(OpPassManager &funcPassManager);
using bufferization::BufferizationOptions;
void addIREEComprehensiveBufferizePasses(
- OpPassManager &passManager,
+ OpPassManager &funcPassManager,
std::optional<BufferizationOptions::AllocationFn> allocationFn =
std::nullopt,
std::optional<BufferizationOptions::MemCpyFn> memCpyFn = std::nullopt);
@@ -51,34 +52,34 @@
/// Pass to perform canonicalizations/cleanups related to HAL interface/buffer
/// allocations and view operations.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createCleanupBufferAllocViewPass();
/// Pass to bufferize dispatches that are copying from one interface to
/// another. This will create a `linalg.generic` op which is a copy that can
/// then be used by backends to handle appropriately.
-std::unique_ptr<OperationPass<ModuleOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createBufferizeCopyOnlyDispatchesPass();
/// Pass to perform canonicalizations/cleanups related to HAL interface/buffer
/// allocations and view operations.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createCleanupBufferAllocViewPass();
/// Concretizes tensor.pad op's result shape if its source op implements
/// OffsetSizeAndStrideOpInterface. For example, pad(extract_slice).
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createConcretizePadResultShapePass();
 /// Convert bf16 arithmetic operations to f32.
-std::unique_ptr<OperationPass<ModuleOp>> createConvertBf16ToUInt16BuffersPass();
+std::unique_ptr<OperationPass<>> createConvertBf16ArithToF32Pass();
/// Convert BF16 buffer ops and conversions to simulated behavior with uint16.
-std::unique_ptr<OperationPass<ModuleOp>> createConvertBf16ArithToF32Pass();
+std::unique_ptr<OperationPass<>> createConvertBf16ToUInt16BuffersPass();
/// Converts entry point function within dispatch regions to use
/// destination-passing style, which is better suited for the upstream
/// comprehensive bufferization pass.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createConvertToDestinationPassingStylePass(
bool useWARForCooperativeMatrixCodegen = false);
@@ -87,7 +88,7 @@
std::unique_ptr<Pass> createDecomposeAffineOpsPass();
// Decomposes batch mmt4d op into mmt4d by tiling the batch dim to 1.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createDecomposeBatchMmt4DOpsPass();
// Decomposes high-D convolution ops into low-D ones.
@@ -99,7 +100,7 @@
/// Creates a pass to decompose tensor.pack and tensor.unpack ops. The pass does
/// tiling and generalization. See implementation for more details.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createDecomposePackUnPackOpsPass(bool tileOuterToOne = false);
/// Creates a pass to convert the softmax op into a sequence of linalg generic
@@ -108,14 +109,15 @@
/// A pass to eliminate tensor.empty ops that could turn into allocations
/// during bufferization.
-std::unique_ptr<OperationPass<ModuleOp>> createEliminateEmptyTensorsPass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createEliminateEmptyTensorsPass();
/// A pass to emulate memref load operations that use narrow integer types
/// with equivalent operations on supported wide integer types.
-std::unique_ptr<OperationPass<ModuleOp>> createEmulateNarrowTypePass();
+std::unique_ptr<OperationPass<>> createEmulateNarrowTypePass();
/// Creates a pass to erase dead alloc ops where all uses are just store ops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createEraseDeadAllocAndStoresPass();
std::unique_ptr<Pass> createEraseHALDescriptorTypeFromMemRefPass();
@@ -135,7 +137,7 @@
std::unique_ptr<OperationPass<ModuleOp>> createFlattenMemRefSubspanPass();
/// Creates a pass to fold `affine.min` ops in tiled and distributed loops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createFoldAffineMinInDistributedLoopsPass();
/// After running the upstream TensorConstantBufferize pass, remove
@@ -146,14 +148,14 @@
/// An ad-hoc pass to canonicalize selected loop carried dependencies on
/// scf.for.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createForOpCanonicalizationPass();
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createMaterializeEncodingIntoNopPass();
/// Fuses tensor.pad ops into their consumer ops' tiled loop nests.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createFuseTensorPadWithConsumerPass();
struct GenericVectorizationPassOptions {
@@ -175,18 +177,18 @@
int64_t maxVectorSize = std::numeric_limits<int64_t>::max();
};
/// Creates a pass to perform vectorization on LinAlg and tensor ops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createGenericVectorizationPass();
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createGenericVectorizationPass(const GenericVectorizationPassOptions &options);
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createOptimizeTensorInsertExtractSlicesPass();
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createHoistStaticallyBoundAllocationsPass();
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createHoistUnrolledVectorExtractInsertSlicePass();
/// Pass to perform linalg on tensor bufferization. The function passed into
@@ -197,7 +199,8 @@
/// is specified, the default allocator generates an `std.alloc` instruction
/// with the allocated MemRefType having no stride map (i.e. default row-major
/// striding) and default memory space.
-std::unique_ptr<OperationPass<ModuleOp>> createIREEComprehensiveBufferizePass(
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createIREEComprehensiveBufferizePass(
std::optional<BufferizationOptions::AllocationFn> allocationFn =
std::nullopt,
std::optional<BufferizationOptions::MemCpyFn> memCpyFn = std::nullopt);
@@ -206,49 +209,57 @@
std::unique_ptr<Pass> createIREEExpandStridedMetadataPass();
/// Instruments memory reads and writes for address tracking.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createInstrumentMemoryAccessesPass();
+/// Pass to lower an executable using the transform dialect sequence provided
+/// in the module.
+std::unique_ptr<OperationPass<ModuleOp>>
+createLowerExecutableUsingTransformDialectPass();
+
/// Pass to lower ukernel operations into their defined function calls.
std::unique_ptr<OperationPass<ModuleOp>> createLowerUKernelOpsToCallsPass();
/// Creates a pass to convert memref.copy to linalg op.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createMemrefCopyToLinalgPass();
/// Extracts lowering configs and translation info from user configs.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
-createMaterializeUserConfigsPass();
+std::unique_ptr<OperationPass<ModuleOp>> createMaterializeUserConfigsPass();
/// Pass to optimize vector transfer_read and transfer_write.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createOptimizeVectorTransferPass(bool flatten = false,
bool dropUnitDims = true);
/// Pad dynamic alloc op to convert them into static one.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createPadDynamicAlloc();
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createPadDynamicAlloc();
/// Pass to convert math operations to their polynomial approximation.
std::unique_ptr<OperationPass<>> createPolynomialApproximationPass();
+/// Pass to reconcile TranslationInfo across multiple functions in a dispatch
+/// and set the appropriate values on the surrounding HAL ops.
+std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+createReconcileTranslationInfoPass();
+
/// Pass to fuse parallel linalg operations.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createRematerializeParallelOpsPass();
/// Creates a pass to remove single iteration distributed loops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createRemoveSingleIterationLoopPass();
/// Create a pass that replaces maximumf/minimumf with minumf/maxnumf ops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createReplaceSlowMinMaxOpsPass();
/// Pass to optimize vector transfer_read and transfer_write. See Passes.td for
/// `option` details.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSplitFullPartialTransferPass();
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSplitFullPartialTransferPass(StringRef option);
/// Tests iree-hal-preprocess-executables-with behavior.
@@ -259,7 +270,7 @@
createTestPartitionableLoopsInterfacePass();
/// Pass to tile and distribute to workgroups.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createTileAndDistributeToWorkgroupsPass(
int32_t maxWorkgroupParallelDims = kNumMaxParallelDims,
linalg::DistributionMethod distributionMethod =
@@ -271,13 +282,11 @@
createTransformDialectInterpreterPass(StringRef transformSequenceName = "");
/// Pass to propagate type to avoid generating load/stores of illegal types.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createTypePropagationPass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createTypePropagationPass();
/// Creates a pass to vectorize a very specific form of tensor.pad ops with
/// control flows.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createVectorizePadPass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createVectorizePadPass();
/// Populates patterns with patterns to concretize tensor.pad op's result
/// shape. `numWorkgroups`, if not empty, will be used as bounds for simplifying
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index bdbd4b9..968d300 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -26,7 +26,7 @@
}
def BufferizeCopyOnlyDispatches :
- Pass<"iree-codegen-bufferize-copy-only-dispatches", "ModuleOp"> {
+ InterfacePass<"iree-codegen-bufferize-copy-only-dispatches", "mlir::FunctionOpInterface"> {
let summary =
"Bufferize dispatches that copy to/from interfaces to convert to a linalg.copy op";
let constructor = "mlir::iree_compiler::createBufferizeCopyOnlyDispatchesPass()";
@@ -47,13 +47,13 @@
let constructor = "mlir::iree_compiler::createConcretizePadResultShapePass()";
}
-def ConvertBf16ArithToF32 : Pass<"iree-convert-bf16-arith-to-f32", "ModuleOp"> {
+def ConvertBf16ArithToF32 : Pass<"iree-convert-bf16-arith-to-f32", ""> {
let summary = "Convert bf16 arithmetic operations to f32";
let constructor = "mlir::iree_compiler::createConvertBf16ArithToF32Pass()";
}
def ConvertBf16ToUInt16Buffers :
- Pass<"iree-convert-bf16-to-uint16-buffers", "ModuleOp"> {
+ Pass<"iree-convert-bf16-to-uint16-buffers", ""> {
let summary = "Convert bf16 buffers to uint16 equivalents";
let constructor = "mlir::iree_compiler::createConvertBf16ToUInt16BuffersPass()";
}
@@ -170,6 +170,15 @@
];
}
+def ReconcileTranslationInfo
+ : Pass<"iree-codegen-reconcile-translation-info", "IREE::HAL::ExecutableVariantOp"> {
+ let summary =
+ "Reconcile information (like workgroup_size, subgroup_size) across "
+ "`TranslationInfo` set on each function in the dispatch and merge them"
+ "and set them at the appropriate places in the surrounding HAL ops";
+ let constructor = "mlir::iree_compiler::createReconcileTranslationInfoPass()";
+}
+
def ReplaceSlowMinMaxOps
: InterfacePass<"iree-codegen-replace-slow-min-max-ops", "mlir::FunctionOpInterface"> {
let summary =
@@ -179,13 +188,13 @@
}
def EliminateEmptyTensors :
- Pass<"iree-eliminate-empty-tensors", "ModuleOp"> {
+ InterfacePass<"iree-eliminate-empty-tensors", "mlir::FunctionOpInterface"> {
let summary = "Eliminate tensor.empty ops to avoid buffer allocations";
let constructor = "mlir::iree_compiler::createEliminateEmptyTensorsPass()";
}
def EmulateNarrowType :
- Pass<"iree-codegen-emulate-narrow-type", "ModuleOp"> {
+ Pass<"iree-codegen-emulate-narrow-type", ""> {
let summary = "Emulate narrow integer operations using wide integer operations";
let constructor = "mlir::iree_compiler::createEmulateNarrowTypePass()";
}
@@ -234,8 +243,7 @@
];
}
-def FlattenMemRefSubspan :
- Pass<"iree-codegen-flatten-memref-subspan", "ModuleOp"> {
+def FlattenMemRefSubspan : Pass<"iree-codegen-flatten-memref-subspan", "ModuleOp"> {
let summary =
"Flatten n-D MemRef subspan ops to 1-D ones and fold byte offsets";
let constructor = "mlir::iree_compiler::createFlattenMemRefSubspanPass()";
@@ -328,7 +336,7 @@
}
def IREEComprehensiveBufferize :
- Pass<"iree-codegen-iree-comprehensive-bufferize", "ModuleOp"> {
+ InterfacePass<"iree-codegen-iree-comprehensive-bufferize", "mlir::FunctionOpInterface"> {
let summary = "Convert from to Linalg ops on tensors to buffers";
let constructor = "mlir::iree_compiler::createIREEComprehensiveBufferizePass()";
let options = [
@@ -357,6 +365,12 @@
let constructor = "mlir::iree_compiler::createInstrumentMemoryAccessesPass()";
}
+def LowerExecutableUsingTransformDialect :
+ Pass<"iree-codegen-lower-executable-using-transform-dialect", "ModuleOp"> {
+ let summary = "Lower executables using the transform dialect recipe provided in the module.";
+ let constructor = "mlir::iree_compiler::createLowerExecutableUsingTransformDialectPass()";
+}
+
def LowerUKernelOpsToCalls :
Pass<"iree-codegen-lower-ukernel-ops-to-calls", "ModuleOp"> {
let summary = "Lower micro-kernel wrapper ops into function calls";
@@ -369,8 +383,7 @@
let constructor = "mlir::iree_compiler::createMaterializeEncodingIntoNopPass()";
}
-def MaterializeUserConfigs :
- Pass<"iree-codegen-materialize-user-configs", "IREE::HAL::ExecutableVariantOp"> {
+def MaterializeUserConfigs : Pass<"iree-codegen-materialize-user-configs", "ModuleOp"> {
let summary = "Sets the lowering configs and translation info from user configs";
let constructor = "mlir::iree_compiler::createMaterializeUserConfigsPass()";
let dependentDialects = [
@@ -465,7 +478,7 @@
}
def TileAndDistributeToWorkgroups :
- Pass<"iree-codegen-tile-and-distribute-to-workgroups", "IREE::HAL::ExecutableVariantOp"> {
+ InterfacePass<"iree-codegen-tile-and-distribute-to-workgroups", "mlir::FunctionOpInterface"> {
let summary = "Tile and distribute operations to workgroups";
let constructor = "mlir::iree_compiler::createTileAndDistributeToWorkgroupsPass()";
let options = [
diff --git a/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
new file mode 100644
index 0000000..04cf6b2
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/ReconcileTranslationInfo.cpp
@@ -0,0 +1,155 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//=== ReconcileTranslationInfo.cpp ---------------------------------------===//
+//
+// While lowering the executable target, the pass pipelines are run at a
+// func-like op granularity. Each of these func-like operations sets the
+// workgroup size and subgroup size as required (as part of the
+// `TranslationInfo`). Eventually these have to be reconciled and set
+// appropriately on the surrounding HAL ops for the host runtime to pick them
+// up. In case of inconsistencies, this pass will throw an error.
+//===---------------------------------------------------------------------===//
+
+#include "iree/compiler/Codegen/Common/PassDetail.h"
+#include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+
+namespace mlir::iree_compiler {
+
+namespace {
+
+class ReconcileTranslationInfoPass
+ : public ReconcileTranslationInfoBase<ReconcileTranslationInfoPass> {
+public:
+ void runOnOperation() override;
+};
+} // namespace
+
+// Reconcile workgroup sizes across all translation infos.
+static FailureOr<SmallVector<int64_t>> reconcileWorkgroupSize(
+ ArrayRef<IREE::Codegen::TranslationInfoAttr> translationInfos) {
+ if (translationInfos.empty()) {
+ return SmallVector<int64_t>{};
+ }
+ SmallVector<int64_t> reconciledWorkgroupSize =
+ llvm::to_vector(translationInfos.front().getWorkgroupSize());
+ for (auto translationInfo : translationInfos.drop_front()) {
+ auto workGroupSize = llvm::to_vector(translationInfo.getWorkgroupSize());
+ if (workGroupSize != reconciledWorkgroupSize) {
+ return failure();
+ }
+ }
+ return reconciledWorkgroupSize;
+}
+
+// Reconcile subgroup size across all translation infos.
+static FailureOr<int64_t> reconcileSubgroupSize(
+ ArrayRef<IREE::Codegen::TranslationInfoAttr> translationInfos) {
+ if (translationInfos.empty()) {
+ return int64_t();
+ }
+ int64_t subgroupSize = translationInfos.front().getSubgroupSize();
+ for (auto translationInfo : translationInfos.drop_front()) {
+ if (subgroupSize != translationInfo.getSubgroupSize()) {
+ return failure();
+ }
+ }
+ return subgroupSize;
+}
+
+/// Helper function to retrieve the waves-per-eu value from translation info.
+static std::optional<int64_t>
+getWavesPerEu(IREE::Codegen::TranslationInfoAttr translationInfo) {
+ auto translationConfig = translationInfo.getConfiguration();
+ if (!translationConfig) {
+ return std::nullopt;
+ }
+ auto attr = translationConfig.getAs<IntegerAttr>("waves_per_eu");
+ if (!attr) {
+ return std::nullopt;
+ }
+ return attr.getValue().getSExtValue();
+}
+
+void ReconcileTranslationInfoPass::runOnOperation() {
+ auto variantOp = getOperation();
+ auto innerModuleOp = variantOp.getInnerModule();
+
+ auto exportOps = variantOp.getOps<IREE::HAL::ExecutableExportOp>();
+ if (!llvm::hasSingleElement(exportOps)) {
+ variantOp.emitOpError("reconciliation for multiple export ops unsupported");
+ return signalPassFailure();
+ }
+ auto exportOp = *exportOps.begin();
+ MLIRContext *context = &getContext();
+ Builder builder(&getContext());
+
+ SmallVector<IREE::Codegen::TranslationInfoAttr> translationInfos;
+ innerModuleOp->walk([&](FunctionOpInterface funcOp) {
+ auto translationInfo = getTranslationInfo(funcOp);
+ if (!translationInfo) {
+ return;
+ }
+
+ translationInfos.push_back(translationInfo);
+    // The following moves the waves-per-eu specification from the
+    // translation info onto the func-like op. This is not the best place to
+    // do this, but the intent is that after this pass all the lowering
+    // configs and translation infos will be deleted.
+ std::optional<int64_t> wavesPerEu = getWavesPerEu(translationInfo);
+ if (wavesPerEu) {
+ funcOp->setAttr("waves_per_eu", IntegerAttr::get(IndexType::get(context),
+ wavesPerEu.value()));
+ }
+ });
+
+ // Reconcile workgroup sizes.
+ FailureOr<SmallVector<int64_t>> reconciledWorkgroupSize =
+ reconcileWorkgroupSize(translationInfos);
+ if (failed(reconciledWorkgroupSize)) {
+ exportOp.emitOpError("failed to reconcile workgroup sizes");
+ return signalPassFailure();
+ }
+ if (reconciledWorkgroupSize->size() > 3) {
+ exportOp.emitOpError(
+ "reconciled workgroup size is greater than 3 (illegal)");
+ return signalPassFailure();
+ }
+ std::array<int64_t, 3> workgroupSize = {1, 1, 1};
+ for (auto [index, size] : llvm::enumerate(reconciledWorkgroupSize.value())) {
+ workgroupSize[index] = size;
+ }
+ auto workgroupSizeArrayAttr = builder.getIndexArrayAttr(workgroupSize);
+ exportOp.setWorkgroupSizeAttr(workgroupSizeArrayAttr);
+
+ // Reconcile subgroup sizes.
+ FailureOr<int64_t> reconciledSubgroupSize =
+ reconcileSubgroupSize(translationInfos);
+ if (failed(reconciledSubgroupSize)) {
+ exportOp.emitOpError("failed to reconcile subgroup size");
+ return signalPassFailure();
+ }
+ if (reconciledSubgroupSize.value() != int64_t()) {
+ exportOp.setSubgroupSizeAttr(
+ builder.getIndexAttr(reconciledSubgroupSize.value()));
+ }
+
+ // Erase all the lowering configs and translation infos.
+ innerModuleOp->walk([](Operation *op) {
+ if (auto funcOp = dyn_cast<FunctionOpInterface>(op)) {
+ eraseTranslationInfo(funcOp);
+ }
+ eraseLoweringConfig(op);
+ });
+}
+
+std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+createReconcileTranslationInfoPass() {
+ return std::make_unique<ReconcileTranslationInfoPass>();
+}
+
+} // namespace mlir::iree_compiler
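
A hedged sketch of where `ReconcileTranslationInfo` is expected to sit in a backend's executable-variant pipeline, matching item 3 of the description: after the per-function lowering pipelines have recorded `workgroup_size`/`subgroup_size` on `translation_info`. The canonicalizer below only stands in for the real lowering pipeline.

```
// Sketch only: reconciliation runs on the variant after per-function lowering.
void buildExampleVariantPipeline(OpPassManager &variantPassManager) {
  OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
  // Stand-in for the backend's per-function lowering pipeline, which sets
  // workgroup_size / subgroup_size on each function's translation_info.
  modulePassManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
  // Fold the per-function translation_info onto hal.executable.export and
  // fail on any mismatch across functions.
  variantPassManager.addPass(createReconcileTranslationInfoPass());
}
```
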
diff --git a/compiler/src/iree/compiler/Codegen/Common/RemoveTrivialLoops.cpp b/compiler/src/iree/compiler/Codegen/Common/RemoveTrivialLoops.cpp
index dfef923..7156e26 100644
--- a/compiler/src/iree/compiler/Codegen/Common/RemoveTrivialLoops.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/RemoveTrivialLoops.cpp
@@ -45,18 +45,20 @@
SmallVectorImpl<Value> & /*symbols*/,
ArrayRef<int64_t> workgroupCount,
ArrayRef<int64_t> workgroupSize) {
- if (auto idOp = processorValue.getDefiningOp<gpu::ThreadIdOp>()) {
- unsigned index = dimToIndex(idOp.getDimension());
- OpBuilder b(processorValue.getContext());
- AffineExpr zero = b.getAffineConstantExpr(0);
- AffineExpr ubExpr = b.getAffineConstantExpr(workgroupSize[index]);
- return std::make_pair(zero, ubExpr - 1);
- }
- if (auto dimOp = processorValue.getDefiningOp<gpu::BlockDimOp>()) {
- OpBuilder builder(processorValue.getContext());
- unsigned index = dimToIndex(dimOp.getDimension());
- AffineExpr bound = builder.getAffineConstantExpr(workgroupSize[index]);
- return std::make_pair(bound, bound);
+ if (!workgroupSize.empty()) {
+ if (auto idOp = processorValue.getDefiningOp<gpu::ThreadIdOp>()) {
+ unsigned index = dimToIndex(idOp.getDimension());
+ OpBuilder b(processorValue.getContext());
+ AffineExpr zero = b.getAffineConstantExpr(0);
+ AffineExpr ubExpr = b.getAffineConstantExpr(workgroupSize[index]);
+ return std::make_pair(zero, ubExpr - 1);
+ }
+ if (auto dimOp = processorValue.getDefiningOp<gpu::BlockDimOp>()) {
+ OpBuilder builder(processorValue.getContext());
+ unsigned index = dimToIndex(dimOp.getDimension());
+ AffineExpr bound = builder.getAffineConstantExpr(workgroupSize[index]);
+ return std::make_pair(bound, bound);
+ }
}
if (workgroupCount.empty())
@@ -111,14 +113,16 @@
: public RemoveSingleIterationLoopBase<RemoveSingleIterationLoopPass> {
void runOnOperation() override {
auto funcOp = getOperation();
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
- if (failed(exportOp))
- return;
- SmallVector<int64_t> workgroupSize = getWorkgroupSize(*exportOp);
+ std::optional<SmallVector<int64_t>> workgroupSize =
+ getWorkgroupSize(funcOp);
+ if (!workgroupSize) {
+ return;
+ }
SmallVector<int64_t> numWorkgroups = getStaticNumWorkgroups(funcOp);
- if (failed(removeOneTripTiledLoops(funcOp, workgroupSize, numWorkgroups))) {
+ if (failed(removeOneTripTiledLoops(funcOp, workgroupSize.value(),
+ numWorkgroups))) {
return signalPassFailure();
}
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileAndDistributeToWorkgroupsPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TileAndDistributeToWorkgroupsPass.cpp
index d94d995..0531b5a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileAndDistributeToWorkgroupsPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileAndDistributeToWorkgroupsPass.cpp
@@ -210,9 +210,9 @@
ArrayRef<OpFoldResult> workgroupCount, ArrayRef<int64_t> tileSizes,
ArrayRef<int64_t> staticLoopRanges, ArrayRef<int64_t> interchange,
ArrayRef<unsigned> partitionedLoops, int maxWorkgroupParallelDims) {
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp =
+ std::optional<IREE::HAL::ExecutableExportOp> exportOp =
getEntryPoint(entryPointFn);
- if (failed(exportOp)) {
+ if (!exportOp) {
return entryPointFn.emitOpError(
"expected function to be entry point function");
}
@@ -281,24 +281,18 @@
void TileAndDistributeToWorkgroupsPass::runOnOperation() {
MLIRContext *context = &getContext();
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- ModuleOp innerModule = variantOp.getInnerModule();
- llvm::StringMap<IREE::HAL::ExecutableExportOp> entryPoints =
- getAllEntryPoints(innerModule);
- if (maxWorkgroupParallelDims > kNumMaxParallelDims) {
- innerModule.emitError(
- "maxWorkgroupParallelDims set to more than allowed MaxParallelDims");
- }
+ auto funcOp = getOperation();
- for (auto funcOp : innerModule.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = entryPoints.lookup(funcOp.getName());
- if (!exportOp)
- continue;
-
- Block *body = exportOp.getWorkgroupCountBody();
+ // TODO(MaheshRavishankar): The logic of lowering workgroup count
+ // needs to be moved out of this pass. Once this is moved to
+ // use scf.forall, this logic can be moved to the scf.forall
+ // resolution phase.
+ auto exportOp = getEntryPoint(funcOp);
+ if (exportOp) {
+ Block *body = exportOp->getWorkgroupCountBody();
if (!body) {
- exportOp.emitOpError("unexpected empty workgroup count region");
+ exportOp->emitOpError("unexpected empty workgroup count region");
return signalPassFailure();
}
@@ -312,72 +306,75 @@
return WalkResult::advance();
});
if (!res.wasInterrupted()) {
- continue;
+ return;
}
+ }
- SmallVector<Operation *> computeOps = getComputeOps(funcOp);
- SmallVector<int64_t> tileSizes, staticLoopRanges, interchange;
- SmallVector<unsigned> partitionableLoops;
- Operation *dispatchRootOp = nullptr;
- if (failed(getTileAndDistributeConfig(computeOps, dispatchRootOp, tileSizes,
- staticLoopRanges, interchange,
- partitionableLoops))) {
- funcOp.emitOpError("failed to get tile and distribute configuration");
+ SmallVector<Operation *> computeOps = getComputeOps(funcOp);
+ SmallVector<int64_t> tileSizes, staticLoopRanges, interchange;
+ SmallVector<unsigned> partitionableLoops;
+ Operation *dispatchRootOp = nullptr;
+ if (failed(getTileAndDistributeConfig(computeOps, dispatchRootOp, tileSizes,
+ staticLoopRanges, interchange,
+ partitionableLoops))) {
+ funcOp.emitOpError("failed to get tile and distribute configuration");
+ return signalPassFailure();
+ }
+
+ IRRewriter rewriter(context);
+
+ // If there are no compute ops, nothing more to do.
+ if (!dispatchRootOp || computeOps.empty()) {
+ if (exportOp && failed(lowerWorkgroupCount(
+ rewriter, funcOp,
+ /*workgroupCountVals =*/ArrayRef<OpFoldResult>{},
+ /*tileSizes =*/ArrayRef<int64_t>{},
+ /*staticLoopRanges =*/ArrayRef<int64_t>{},
+ /*interchange =*/ArrayRef<int64_t>{},
+ /*partitionedLoops =*/ArrayRef<unsigned>{},
+ maxWorkgroupParallelDims))) {
+ funcOp.emitOpError(
+ "failed to lower workgroup count region when no compute ops in the "
+ "dispatch");
return signalPassFailure();
}
+ return;
+ }
- IRRewriter rewriter(context);
- // If there are no compute ops, nothing more to do.
- if (!dispatchRootOp || computeOps.empty()) {
- if (failed(lowerWorkgroupCount(
- rewriter, funcOp,
- /*workgroupCountVals =*/ArrayRef<OpFoldResult>{},
- /*tileSizes =*/ArrayRef<int64_t>{},
- /*staticLoopRanges =*/ArrayRef<int64_t>{},
- /*interchange =*/ArrayRef<int64_t>{},
- /*partitionedLoops =*/ArrayRef<unsigned>{},
- maxWorkgroupParallelDims))) {
- funcOp.emitOpError(
- "failed to lower workgroup count region when no compute ops in the "
- "dispatch");
- return signalPassFailure();
- }
- continue;
- }
+ // Configure the linalg options.
+ // Tile size selection function.
+ auto tileSizeFn = [&](OpBuilder &builder,
+ Operation *op) -> SmallVector<Value> {
+ // Check if tile sizes are deduced from the configuration. If so use
+ // those.
+ return llvm::map_to_vector(tileSizes, [&](int64_t ts) -> Value {
+ return builder.create<arith::ConstantIndexOp>(op->getLoc(), ts);
+ });
+ };
- // Configure the linalg options.
- // Tile size selection function.
- auto tileSizeFn = [&](OpBuilder &builder,
- Operation *op) -> SmallVector<Value> {
- // Check if tile sizes are deduced from the configuration. If so use
- // those.
- return llvm::map_to_vector(tileSizes, [&](int64_t ts) -> Value {
- return builder.create<arith::ConstantIndexOp>(op->getLoc(), ts);
- });
- };
+ linalg::DistributionMethod distributionMethodValue =
+ (linalg::DistributionMethod)(distributionMethod.getValue());
+ auto linalgTilingOptions =
+ linalg::LinalgTilingOptions()
+ .setDistributionOptions(getIREELinalgLoopDistributionOptions(
+ tileSizes, distributionMethodValue, maxWorkgroupParallelDims))
+ .setInterchange(llvm::map_to_vector(
+ interchange,
+ [](int64_t v) -> unsigned { return static_cast<unsigned>(v); }))
+ .setLoopType(linalg::LinalgTilingLoopType::Loops)
+ .setTileSizeComputationFunction(tileSizeFn);
- linalg::DistributionMethod distributionMethodValue =
- (linalg::DistributionMethod)(distributionMethod.getValue());
- auto linalgTilingOptions =
- linalg::LinalgTilingOptions()
- .setDistributionOptions(getIREELinalgLoopDistributionOptions(
- tileSizes, distributionMethodValue, maxWorkgroupParallelDims))
- .setInterchange(llvm::map_to_vector(
- interchange,
- [](int64_t v) -> unsigned { return static_cast<unsigned>(v); }))
- .setLoopType(linalg::LinalgTilingLoopType::Loops)
- .setTileSizeComputationFunction(tileSizeFn);
+ FailureOr<IREETileAndFuseResult> tileAndFuseResult =
+ tileAndFuseDispatchUsingSCFForOp(rewriter,
+ cast<TilingInterface>(computeOps.back()),
+ linalgTilingOptions);
+ if (failed(tileAndFuseResult)) {
+ funcOp.emitOpError("Tile+Distribute failed");
+ return signalPassFailure();
+ }
- FailureOr<IREETileAndFuseResult> tileAndFuseResult =
- tileAndFuseDispatchUsingSCFForOp(
- rewriter, cast<TilingInterface>(computeOps.back()),
- linalgTilingOptions);
- if (failed(tileAndFuseResult)) {
- funcOp.emitOpError("Tile+Distribute failed");
- return signalPassFailure();
- }
-
- // Materialize the computation for workgroup counts.
+ // Materialize the computation for workgroup counts.
+ if (exportOp) {
auto workgroupCountOfr =
getAsOpFoldResult(tileAndFuseResult->workgroupCount);
if (failed(lowerWorkgroupCount(
@@ -391,66 +388,66 @@
{
RewritePatternSet patterns(exportOp->getContext());
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
- if (failed(applyPatternsAndFoldGreedily(exportOp, std::move(patterns)))) {
- exportOp.emitOpError("`tensor.dim` resolution in exportOp failed");
+ if (failed(applyPatternsAndFoldGreedily(exportOp.value(),
+ std::move(patterns)))) {
+ exportOp->emitOpError("`tensor.dim` resolution in exportOp failed");
return signalPassFailure();
}
}
+ }
- {
- RewritePatternSet patterns(context);
- populateTileAndDistributeToWorkgroupsCleanupPatterns(patterns,
- linalgTilingOptions);
- if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
- funcOp.emitOpError("Tile+Distribute clean up patterns failed");
- return signalPassFailure();
- }
- }
-
- LLVM_DEBUG({
- llvm::dbgs() << "--- After Tile + Distribute ---\n";
- funcOp.print(llvm::dbgs(), OpPrintingFlags().useLocalScope());
- llvm::dbgs() << "\n\n";
- });
-
- {
- SmallVector<int64_t> staticNumWorkgroup = getStaticNumWorkgroups(funcOp);
- // Apply linalg tiling optimization patterns, which includes folding
- // casting ops into tiled operations.
- RewritePatternSet patterns(context);
- linalg::populateLinalgTilingCanonicalizationPatterns(patterns);
- tensor::populateFoldTensorEmptyPatterns(patterns);
- populateFoldAffineMinInDistributedLoopsPatterns(patterns,
- staticNumWorkgroup);
- context->getOrLoadDialect<tensor::TensorDialect>()
- ->getCanonicalizationPatterns(patterns);
- context->getOrLoadDialect<IREE::LinalgExt::IREELinalgExtDialect>()
- ->getCanonicalizationPatterns(patterns);
- if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
- funcOp.emitOpError("tiling canonicalizations failed");
- return signalPassFailure();
- }
- }
-
- LLVM_DEBUG({
- llvm::dbgs() << "--- After Canonicalize ---\n";
- funcOp.print(llvm::dbgs(), OpPrintingFlags().useLocalScope());
- llvm::dbgs() << "\n\n";
- });
-
- // After rewriting destructive updates, there might be uses of compute
- // operations only in `tensor.dim` ops. Resolve these.
- RewritePatternSet resolveDimOps(context);
- memref::populateResolveRankedShapedTypeResultDimsPatterns(resolveDimOps);
- if (failed(
- applyPatternsAndFoldGreedily(funcOp, std::move(resolveDimOps)))) {
- funcOp.emitOpError("resolving ranked shaped results dims failed");
+ {
+ RewritePatternSet patterns(context);
+ populateTileAndDistributeToWorkgroupsCleanupPatterns(patterns,
+ linalgTilingOptions);
+ if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
+ funcOp.emitOpError("Tile+Distribute clean up patterns failed");
return signalPassFailure();
}
}
+
+ LLVM_DEBUG({
+ llvm::dbgs() << "--- After Tile + Distribute ---\n";
+ funcOp.print(llvm::dbgs(), OpPrintingFlags().useLocalScope());
+ llvm::dbgs() << "\n\n";
+ });
+
+ {
+ SmallVector<int64_t> staticNumWorkgroup = getStaticNumWorkgroups(funcOp);
+ // Apply linalg tiling optimization patterns, which includes folding
+ // casting ops into tiled operations.
+ RewritePatternSet patterns(context);
+ linalg::populateLinalgTilingCanonicalizationPatterns(patterns);
+ tensor::populateFoldTensorEmptyPatterns(patterns);
+ populateFoldAffineMinInDistributedLoopsPatterns(patterns,
+ staticNumWorkgroup);
+ context->getOrLoadDialect<tensor::TensorDialect>()
+ ->getCanonicalizationPatterns(patterns);
+ context->getOrLoadDialect<IREE::LinalgExt::IREELinalgExtDialect>()
+ ->getCanonicalizationPatterns(patterns);
+ if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
+ funcOp.emitOpError("tiling canonicalizations failed");
+ return signalPassFailure();
+ }
+ }
+
+ LLVM_DEBUG({
+ llvm::dbgs() << "--- After Canonicalize ---\n";
+ funcOp.print(llvm::dbgs(), OpPrintingFlags().useLocalScope());
+ llvm::dbgs() << "\n\n";
+ });
+
+ // After rewriting destructive updates, there might be uses of compute
+ // operations only in `tensor.dim` ops. Resolve these.
+ RewritePatternSet resolveDimOps(context);
+ memref::populateResolveRankedShapedTypeResultDimsPatterns(resolveDimOps);
+ if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(resolveDimOps)))) {
+ funcOp.emitOpError("resolving ranked shaped results dims failed");
+ return signalPassFailure();
+ }
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createTileAndDistributeToWorkgroupsPass(
int32_t maxWorkgroupParallelDims,
linalg::DistributionMethod distributionMethod) {
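With the pass nested on `FunctionOpInterface`, the export op is looked up from the function via `getEntryPoint` and treated as optional, so the same code path handles functions that have no corresponding `hal.executable.export`. As an illustration (binding layout and shapes invented for the sketch), a copy-only function like the one below takes the "no compute ops" early-exit path without any executable wrapper:

```
func.func @copy_only_dispatch() {
  %in = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x64xf32>>
  %out = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x64xf32>>
  %t = flow.dispatch.tensor.load %in, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x64xf32>> -> tensor<64x64xf32>
  flow.dispatch.tensor.store %t, %out, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : tensor<64x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x64xf32>>
  return
}
```

When no export op is present, the workgroup-count lowering and the `tensor.dim` resolution on the export are simply skipped.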
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp
index cc421da..7773af8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.cpp
@@ -171,6 +171,9 @@
case 1:
// Only distribution level.
return {0};
+ case 3:
+ // Only distribution level + vector common parallel levels.
+ return {0, 1};
case 4:
// Distribution + vector common parallel levels + vector inner parallel
// levels.
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h
index 0ec1180..86bd71b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h
+++ b/compiler/src/iree/compiler/Codegen/Common/TileSizeSelection.h
@@ -22,9 +22,10 @@
/// We currently support the following scenarios:
/// 1. [[distribution]]
/// 2. [[distribution], [vector-common-parallel]]
-/// 3. [[distribution], [vector-common-parallel], [vector-reduction],
+/// 3. [[distribution], [vector-common-parallel], [vector-reduction]]
+/// 4. [[distribution], [vector-common-parallel], [vector-reduction],
/// [vector-inner-parallel]]
-/// 4. [[distribution], [cache-parallel], [cache-reduction],
+/// 5. [[distribution], [cache-parallel], [cache-reduction],
/// [vector-parallel], [vector-reduction]]
class TilingConfig {
public:
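The newly documented scenario 3 covers configurations with only distribution, vector-common-parallel, and vector-reduction levels. A hypothetical `lowering_config` of that shape (tile sizes picked arbitrarily for illustration) would be:

```
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [8, 32, 0], [0, 0, 16]]>
```

which the `case 3` addition above resolves to levels `{0, 1}`, i.e. distribution plus the vector-common-parallel level.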
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
index 26e3c7e..98ddadc 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -393,8 +393,7 @@
//===---------------------------------------------------------------------===//
LogicalResult rewriteForallToWorkgroup(RewriterBase &rewriter,
- scf::ForallOp forallOp,
- IREE::HAL::ExecutableExportOp exportOp) {
+ scf::ForallOp forallOp) {
// Step 0. Target-specific verifications. There is no good place to anchor
// those right now: the ForallOp is target-independent and the
// transform op does not apply to individual ForallOp.
@@ -496,23 +495,6 @@
transform::TransformRewriter &rewriter, mlir::FunctionOpInterface target,
transform::ApplyToEachResultList &results,
transform::TransformState &state) {
- if (!isa<HAL::ExecutableOp, HAL::ExecutableVariantOp>(state.getTopLevel())) {
- return mlir::emitDefiniteFailure(
- state.getTopLevel(),
- "requires HAL::ExecutableOp or HAL::ExecutableVariantOp toplevel "
- "to attach the workgroup size information to a nested "
- "ExecutableExportOp");
- }
-
- IREE::HAL::ExecutableExportOp exportOp;
- state.getTopLevel()->walk([&](IREE::HAL::ExecutableExportOp op) {
- if (op.getSymName() == target.getName())
- exportOp = op;
- });
- if (!exportOp) {
- return mlir::emitSilenceableFailure(
- target, "no IREE::HAL::ExecutableExportOp found");
- }
scf::ForallOp topLevelForallOp;
auto walkResult = target->walk([&](scf::ForallOp forallOp) {
@@ -530,7 +512,7 @@
}
rewriter.setInsertionPoint(topLevelForallOp);
- if (failed(rewriteForallToWorkgroup(rewriter, topLevelForallOp, exportOp)))
+ if (failed(rewriteForallToWorkgroup(rewriter, topLevelForallOp)))
return mlir::emitDefiniteFailure(target, "rewriteForallToWorkgroup failed");
return DiagnosedSilenceableFailure::success();
@@ -760,19 +742,7 @@
transform::TransformRewriter &rewriter,
transform::TransformResults &results, transform::TransformState &state) {
auto payload = state.getPayloadOps(getTarget());
- if (!llvm::hasSingleElement(payload) ||
- !isa<ModuleOp, HAL::ExecutableOp, HAL::ExecutableVariantOp>(
- *payload.begin())) {
- return mlir::emitDefiniteFailure(
- state.getTopLevel(), "requires exactly a single HAL::ExecutableOp or "
- "HAL::ExecutableVariantOp target op.");
- }
- //===-------------------------------------------------------------------===//
- // DO NOT JUST CALL `addIREEComprehensiveBufferizePasses` as this results in
- // a lot of registration issues due to nested pass pipeline mess.
- // Instead, take what we need from it.
- //===-------------------------------------------------------------------===//
// Bufferize the dispatch.
using mlir::bufferization::BufferizationOptions;
BufferizationOptions::AllocationFn allocationFn =
@@ -793,15 +763,10 @@
config.listener = &listener;
// Manually gather list of ops because the other GreedyPatternRewriteDriver
// overloads only accepts ops that are isolated from above.
- SmallVector<Operation *> ops;
- state.getTopLevel()->walk([&](Operation *nestedOp) {
- if (state.getTopLevel() != nestedOp)
- ops.push_back(nestedOp);
- });
LogicalResult result =
- applyOpPatternsAndFold(ops, std::move(patterns), config);
+ applyOpPatternsAndFold(target, std::move(patterns), config);
if (failed(result)) {
- return mlir::emitDefiniteFailure(state.getTopLevel(),
+ return mlir::emitDefiniteFailure(target,
"greedy pattern application failed");
}
if (listener.failed())
@@ -814,9 +779,18 @@
options.memCpyFn = memCpyFn;
options.testAnalysisOnly = getTestAnalysisOnly();
options.printConflicts = getPrintConflicts();
- if (failed(runIREEOneShotBufferize(state.getTopLevel(), options)))
- return mlir::emitDefiniteFailure(state.getTopLevel(),
- "bufferization failed");
+
+ if (getTargetGpu()) {
+ options.defaultMemorySpaceFn =
+ [&](TensorType t) -> std::optional<Attribute> {
+ Attribute addressSpaceAttr = gpu::AddressSpaceAttr::get(
+ t.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
+ return addressSpaceAttr;
+ };
+ }
+ if (failed(runIREEOneShotBufferize(target, options))) {
+ return mlir::emitDefiniteFailure(target, "bufferization failed");
+ }
// Early exit if test_analysis_only is set.
if (getTestAnalysisOnly()) {
@@ -827,21 +801,12 @@
// 3. Post-bufferization passes are fine.
PassManager pm(getContext());
addIREEPostBufferizationPasses(pm);
- WalkResult res = state.getTopLevel()->walk([&](ModuleOp moduleOp) {
- if (failed(pm.run(moduleOp))) {
- getOperation()->emitError()
- << "failed to post-bufferization passes on module:\n"
- << *(moduleOp.getOperation()) << "\nunder top-level:\n"
- << *state.getTopLevel();
- return WalkResult::interrupt();
- }
- return WalkResult::advance();
- });
- if (res.wasInterrupted())
+ if (failed(pm.run(target))) {
return mlir::emitDefiniteFailure(target)
<< "post-bufferization passes failed";
+ }
- results.set(getOperation()->getOpResult(0), {*payload.begin()});
+ results.set(getOperation()->getOpResult(0), {target});
return listener.checkAndResetError();
}
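The new `getTargetGpu()` branch installs a default memory-space function in the one-shot bufferization options, so that on GPU targets tensors without an explicit memory space bufferize into workgroup memory. Roughly, an allocation produced under that option looks like (shape invented for the sketch):

```
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
```

The op also runs directly on the `FunctionOpInterface` payload now, which is why the walks over `state.getTopLevel()` are gone.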
diff --git a/compiler/src/iree/compiler/Codegen/Common/UserConfig.cpp b/compiler/src/iree/compiler/Codegen/Common/UserConfig.cpp
index 2103710..2ce80a8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/UserConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/UserConfig.cpp
@@ -22,12 +22,6 @@
if (failed(setTranslationInfo(entryPointFn, info)))
return failure();
- SmallVector<int64_t> workgroupSize = compilationInfo.getWorkgroupSizeVals();
- std::optional<int64_t> subgroupSize = compilationInfo.getSubgroupSize();
- if (failed(setDispatchConfig(entryPointFn, workgroupSize, subgroupSize))) {
- return failure();
- }
-
setLoweringConfig(computeOp, compilationInfo.getLoweringConfig());
eraseCompilationInfo(computeOp);
return success();
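With workgroup and subgroup sizes folded into `translation_info` (now attached to the function), the separate `setDispatchConfig` call on the export op is no longer needed here. A user-supplied `compilation_info` carries those fields on the translation info itself; a hypothetical GPU example (pipeline name, sizes, and exact attribute assembly are illustrative only):

```
#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1] subgroup_size = 32>
#compilation = #iree_codegen.compilation_info<
    lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0]]>,
    translation_info = #translation>
```

The sketch only shows where the fields now live; the printed form of the extended `translation_info` attribute may differ slightly.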
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
index d55b1c9..77fb37e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -53,6 +53,7 @@
"materialize_user_configs.mlir",
"pad_dynamic_alloc.mlir",
"polynomial_approximation.mlir",
+ "reconcile_translation_info.mlir",
"reductions.mlir",
"rematerialize_parallel_ops.mlir",
"remove_dead_allocs.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
index 8cc7325..6baece8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -49,6 +49,7 @@
"materialize_user_configs.mlir"
"pad_dynamic_alloc.mlir"
"polynomial_approximation.mlir"
+ "reconcile_translation_info.mlir"
"reductions.mlir"
"rematerialize_parallel_ops.mlir"
"remove_dead_allocs.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir b/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir
index 5e0e145..1d9a1cb 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-codegen-bufferize-copy-only-dispatches --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-bufferize-copy-only-dispatches))" --split-input-file %s | FileCheck %s
builtin.module {
func.func @tensor_insert_slice() {
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir b/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir
index 00d13e8..cc7600b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-eliminate-empty-tensors)" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-eliminate-empty-tensors))" %s | FileCheck %s
// -----
func.func @eliminate_empty_tensors_with_store_op() {
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
index c741a2b..25ede6c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt %s --iree-codegen-iree-comprehensive-bufferize --canonicalize -cse --canonicalize --split-input-file | FileCheck %s
+// RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-codegen-iree-comprehensive-bufferize, canonicalize, cse, canonicalize))" --split-input-file | FileCheck %s
func.func @matmul() {
%c0 = arith.constant 0 : index
@@ -265,31 +265,30 @@
// -----
-module {
- func.func @tile_from_tensor_load_inplace() {
- %c2 = arith.constant 2 : index
- %c4 = arith.constant 4 : index
- %c1 = arith.constant 1 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.constant.load[1] : index
- %2 = hal.interface.constant.load[2] : index
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
- %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
- scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
- %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
- %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
- %8 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
- %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %9, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- }
+func.func @tile_from_tensor_load_inplace() {
+ %c2 = arith.constant 2 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
+ scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
+ %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
+ %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %9, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
}
- return
}
+ return
}
+
// CHECK-LABEL: func.func @tile_from_tensor_load_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -305,33 +304,32 @@
// -----
-module {
- func.func @tile_from_tensor_load_inplace_and_copy() {
- %c2 = arith.constant 2 : index
- %c4 = arith.constant 4 : index
- %c1 = arith.constant 1 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.constant.load[1] : index
- %2 = hal.interface.constant.load[2] : index
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
- %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
- scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
- %7 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
- %8 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
- %9 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
- %10 = linalg.matmul ins(%7, %8 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %10, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- flow.dispatch.tensor.store %10, %6, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
- }
+func.func @tile_from_tensor_load_inplace_and_copy() {
+ %c2 = arith.constant 2 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
+ scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
+ %7 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
+ %8 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
+ %9 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
+ %10 = linalg.matmul ins(%7, %8 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %10, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+ flow.dispatch.tensor.store %10, %6, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
}
- return
}
+ return
}
+
// CHECK-LABEL: func.func @tile_from_tensor_load_inplace_and_copy()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -346,42 +344,41 @@
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT1]]
// CHECK: %[[RESULT2:.+]] = memref.subview %[[RETURN2]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
-// CHECK: linalg.generic {{.*}} ins(%[[RESULT1]] {{.*}} outs(%[[RESULT2]]
+// CHECK: linalg.generic {{.*}} ins(%[[RESULT1]] {{.*}} outs(%[[RESULT2]]
// -----
#map = affine_map<(d0, d1) -> (d0, d1)>
-module {
- func.func @tile_from_pointwise_lhs_inplace() {
- %c2 = arith.constant 2 : index
- %c4 = arith.constant 4 : index
- %c1 = arith.constant 1 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.constant.load[1] : index
- %2 = hal.interface.constant.load[2] : index
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
- %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
- scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
- %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
- %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
- %8 = bufferization.alloc_tensor() : tensor<1x3xf32>
- %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<1x3xf32>) outs(%8 : tensor<1x3xf32>) {
- ^bb0(%arg2: f32, %arg3: f32):
- %12 = arith.addf %arg2, %arg2 : f32
- linalg.yield %12 : f32
- } -> tensor<1x3xf32>
- %10 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
- %11 = linalg.matmul ins(%9, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %11, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- }
+func.func @tile_from_pointwise_lhs_inplace() {
+ %c2 = arith.constant 2 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
+ scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
+ %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
+ %8 = bufferization.alloc_tensor() : tensor<1x3xf32>
+ %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<1x3xf32>) outs(%8 : tensor<1x3xf32>) {
+ ^bb0(%arg2: f32, %arg3: f32):
+ %12 = arith.addf %arg2, %arg2 : f32
+ linalg.yield %12 : f32
+ } -> tensor<1x3xf32>
+ %10 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
+ %11 = linalg.matmul ins(%9, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%10 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %11, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
}
- return
}
+ return
}
+
// CHECK-LABEL: func.func @tile_from_pointwise_lhs_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -402,37 +399,35 @@
// -----
#map = affine_map<(d0, d1) -> (d0, d1)>
-module {
- func.func @tile_from_pointwise_outs() {
- %c2 = arith.constant 2 : index
- %c4 = arith.constant 4 : index
- %c1 = arith.constant 1 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.constant.load[1] : index
- %2 = hal.interface.constant.load[2] : index
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
- %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
- %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
- scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
- %7 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
- %8 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
- %9 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
- %10 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
- %11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<1x1xf32>) outs(%7 : tensor<1x1xf32>) {
- ^bb0(%arg2: f32, %arg3: f32):
- %13 = arith.addf %arg2, %arg2 : f32
- linalg.yield %13 : f32
- } -> tensor<1x1xf32>
- %12 = linalg.matmul ins(%8, %9 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%11 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %12, %6, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
- }
+func.func @tile_from_pointwise_outs() {
+ %c2 = arith.constant 2 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
+ scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
+ %7 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
+ %9 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
+ %10 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<1x1xf32>) outs(%7 : tensor<1x1xf32>) {
+ ^bb0(%arg2: f32, %arg3: f32):
+ %13 = arith.addf %arg2, %arg2 : f32
+ linalg.yield %13 : f32
+ } -> tensor<1x1xf32>
+ %12 = linalg.matmul ins(%8, %9 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%11 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %12, %6, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
}
- return
}
+ return
}
// CHECK-LABEL: func.func @tile_from_pointwise_outs()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
@@ -455,40 +450,36 @@
// -----
#map = affine_map<(d0, d1) -> (d0, d1)>
-module {
- func.func @tile_from_pointwise_outs_inplace() {
- %cst = arith.constant 1.000000e+00 : f32
- %c2 = arith.constant 2 : index
- %c4 = arith.constant 4 : index
- %c1 = arith.constant 1 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.constant.load[1] : index
- %2 = hal.interface.constant.load[2] : index
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
- %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
- scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
- %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
- %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
- %8 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
- %9 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%8 : tensor<1x1xf32>) {
- ^bb0(%arg2: f32):
- %11 = arith.addf %arg2, %cst : f32
- linalg.yield %11 : f32
- } -> tensor<1x1xf32>
- %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %10, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- }
+func.func @tile_from_pointwise_outs_inplace() {
+ %cst = arith.constant 1.000000e+00 : f32
+ %c2 = arith.constant 2 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
+ scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
+ %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
+ %9 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%8 : tensor<1x1xf32>) {
+ ^bb0(%arg2: f32):
+ %11 = arith.addf %arg2, %cst : f32
+ linalg.yield %11 : f32
+ } -> tensor<1x1xf32>
+ %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %10, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
}
- return
}
+ return
}
-// -----
-
// CHECK-LABEL: func.func @tile_from_pointwise_outs_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -506,32 +497,31 @@
// -----
-module {
- func.func @tile_from_matmul_outs_inplace() {
- %c2 = arith.constant 2 : index
- %c4 = arith.constant 4 : index
- %c1 = arith.constant 1 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.constant.load[1] : index
- %2 = hal.interface.constant.load[2] : index
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
- %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
- scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
- %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
- %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
- %8 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
- %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
- %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %10, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
- }
+func.func @tile_from_matmul_outs_inplace() {
+ %c2 = arith.constant 2 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
+ scf.for %arg1 = %workgroup_id_x to %c4 step %c4 {
+ %6 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [1, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<1x3xf32>
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x1xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<1x1xf32>
+ %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %10, %5, offsets = [%arg0, %arg1], sizes = [1, 1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
}
- return
}
+ return
}
+
// CHECK-LABEL: func.func @tile_from_matmul_outs_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -550,44 +540,43 @@
#map0 = affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)>
#map1 = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
-module {
- func.func @bufferize_dynamic_inplace() {
- %c1 = arith.constant 1 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.constant.load[1] : index
- %2 = hal.interface.constant.load[2] : index
- %3 = hal.interface.constant.load[3] : index
- %4 = hal.interface.constant.load[4] : index
- %5 = hal.interface.constant.load[5] : index
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
- %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%4, %5}
- %workgroup_size_x = hal.interface.workgroup.size[0] : index
- %workgroup_size_y = hal.interface.workgroup.size[1] : index
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %9 = arith.muli %workgroup_size_y, %workgroup_id_y : index
- %10 = arith.muli %workgroup_size_y, %workgroup_count_y : index
- scf.for %arg0 = %9 to %0 step %10 {
- %11 = arith.muli %workgroup_size_x, %workgroup_id_x : index
- %12 = arith.muli %workgroup_size_x, %workgroup_count_x : index
- scf.for %arg1 = %11 to %3 step %12 {
- %13 = affine.min #map0(%arg0)[%0, %workgroup_size_y]
- %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, 0], sizes = [%13, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
- %15 = affine.min #map0(%arg1)[%3, %workgroup_size_x]
- %16 = flow.dispatch.tensor.load %7, offsets = [0, %arg1], sizes = [%2, %15], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
- %17 = affine.min #map1(%arg0)[%workgroup_size_y, %4]
- %18 = affine.min #map1(%arg1)[%workgroup_size_x, %5]
- %19 = flow.dispatch.tensor.load %8, offsets = [%arg0, %arg1], sizes = [%17, %18], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%4, %5} -> tensor<?x?xf32>
- %20 = linalg.matmul ins(%14, %16 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %20, %8, offsets = [%arg0, %arg1], sizes = [%17, %18], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%4, %5}
- }
+func.func @bufferize_dynamic_inplace() {
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.constant.load[3] : index
+ %4 = hal.interface.constant.load[4] : index
+ %5 = hal.interface.constant.load[5] : index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%4, %5}
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %9 = arith.muli %workgroup_size_y, %workgroup_id_y : index
+ %10 = arith.muli %workgroup_size_y, %workgroup_count_y : index
+ scf.for %arg0 = %9 to %0 step %10 {
+ %11 = arith.muli %workgroup_size_x, %workgroup_id_x : index
+ %12 = arith.muli %workgroup_size_x, %workgroup_count_x : index
+ scf.for %arg1 = %11 to %3 step %12 {
+ %13 = affine.min #map0(%arg0)[%0, %workgroup_size_y]
+ %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, 0], sizes = [%13, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %15 = affine.min #map0(%arg1)[%3, %workgroup_size_x]
+ %16 = flow.dispatch.tensor.load %7, offsets = [0, %arg1], sizes = [%2, %15], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %17 = affine.min #map1(%arg0)[%workgroup_size_y, %4]
+ %18 = affine.min #map1(%arg1)[%workgroup_size_x, %5]
+ %19 = flow.dispatch.tensor.load %8, offsets = [%arg0, %arg1], sizes = [%17, %18], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%4, %5} -> tensor<?x?xf32>
+ %20 = linalg.matmul ins(%14, %16 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%19 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %20, %8, offsets = [%arg0, %arg1], sizes = [%17, %18], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%4, %5}
}
- return
}
+ return
}
+
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
// CHECK: func.func @bufferize_dynamic_inplace()
@@ -2160,18 +2149,17 @@
// -----
-module {
- func.func @rank_reducing_no_op_subview() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : index
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0}
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%0}
- %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0} -> tensor<1x?xf32>
- %4 = tensor.extract_slice %3[0, 0] [1, %0] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
- flow.dispatch.tensor.store %4, %2, offsets = [0], sizes = [%0], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%0}
- return
- }
+func.func @rank_reducing_no_op_subview() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0}
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%0}
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0} -> tensor<1x?xf32>
+ %4 = tensor.extract_slice %3[0, 0] [1, %0] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
+ flow.dispatch.tensor.store %4, %2, offsets = [0], sizes = [%0], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%0}
+ return
}
+
// CHECK-LABEL: func.func @rank_reducing_no_op_subview()
// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan set(0) binding(0)
// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan set(0) binding(1)
@@ -2444,34 +2432,32 @@
// -----
-module {
- func.func @reduction_ew() {
- %c5120 = arith.constant 5120 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %cst_0 = arith.constant 1.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1001xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1x1001xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1001xf32>>
- %3 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1001xf32>> -> tensor<1x1001xf32>
- %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1001], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1001xf32>> -> tensor<1001xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1001xf32>> -> tensor<1x1001xf32>
- %6 = bufferization.alloc_tensor() : tensor<f32>
- %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<f32>) -> tensor<f32>
- %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%4 : tensor<1001xf32>) outs(%7 : tensor<f32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0]]>} {
- ^bb0(%arg0: f32, %arg1: f32):
- %10 = arith.addf %arg0, %arg1 : f32
- linalg.yield %10 : f32
- } -> tensor<f32>
- %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5, %8 : tensor<1x1001xf32>, tensor<f32>) outs(%3 : tensor<1x1001xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %10 = arith.divf %cst_0, %arg1 : f32
- %11 = arith.mulf %arg0, %10 : f32
- linalg.yield %11 : f32
- } -> tensor<1x1001xf32>
- flow.dispatch.tensor.store %9, %2, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : tensor<1x1001xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1001xf32>>
- return
- }
+func.func @reduction_ew() {
+ %c5120 = arith.constant 5120 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 1.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1001xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1x1001xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1001xf32>>
+ %3 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x1001xf32>> -> tensor<1x1001xf32>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1001], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1001xf32>> -> tensor<1001xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1001xf32>> -> tensor<1x1001xf32>
+ %6 = bufferization.alloc_tensor() : tensor<f32>
+ %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<f32>) -> tensor<f32>
+ %8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%4 : tensor<1001xf32>) outs(%7 : tensor<f32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0]]>} {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %10 = arith.addf %arg0, %arg1 : f32
+ linalg.yield %10 : f32
+ } -> tensor<f32>
+ %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5, %8 : tensor<1x1001xf32>, tensor<f32>) outs(%3 : tensor<1x1001xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+ %10 = arith.divf %cst_0, %arg1 : f32
+ %11 = arith.mulf %arg0, %10 : f32
+ linalg.yield %11 : f32
+ } -> tensor<1x1001xf32>
+ flow.dispatch.tensor.store %9, %2, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : tensor<1x1001xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1001xf32>>
+ return
}
// CHECK: func.func @reduction_ew
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir
index c27b743..2a6ff51 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir
@@ -1,47 +1,28 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-codegen-materialize-user-configs)' --split-input-file %s | FileCheck %s
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[64, 64, 0], [32, 32, 0], [0, 0, 32], [0, 0, 0]]>,
- translation_info = <CPUDoubleTilingExpert>>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_matmul_tensors {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {target_triple="x86_64-xyz-xyz"}>) {
- hal.executable.export @preset_config layout(#pipeline_layout)
- builtin.module {
- func.func @preset_config() {
- %cst = arith.constant 0.000000e+00 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<256x512xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [128, 256], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [256, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x512xf32>> -> tensor<256x512xf32>
- %init = tensor.empty() : tensor<128x512xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<128x512xf32>) -> tensor<128x512xf32>
- %gemm = linalg.matmul {compilation_info = #compilation}
- ins(%lhs, %rhs : tensor<128x256xf32>, tensor<256x512xf32>)
- outs(%fill : tensor<128x512xf32>) -> tensor<128x512xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [128, 512], strides = [1, 1]
- : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [32, 32, 0], [0, 0, 32], [0, 0, 0]]>
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {target_triple = "x86_64-xyz-xyz"}>
+#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @preset_config() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x512xf32>> -> tensor<256x512xf32>
+ %5 = tensor.empty() : tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<128x256xf32>, tensor<256x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [32, 32, 0], [0, 0, 32], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export
+// CHECK: func.func @preset_config()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: func.func @preset_config
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info.mlir b/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info.mlir
new file mode 100644
index 0000000..5f53d4b
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/test/reconcile_translation_info.mlir
@@ -0,0 +1,146 @@
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-reconcile-translation-info)))" %s --verify-diagnostics | FileCheck %s
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @err_multiple_entry_point {
+ // expected-error @+1 {{reconciliation for multiple export ops unsupported}}
+ hal.executable.variant public @reconcile_workgroup_size target(#hal.executable.target<"", "", {}>) {
+ hal.executable.export public @entry_point1 layout(#pipeline_layout)
+ hal.executable.export public @entry_point2 layout(#pipeline_layout)
+ }
+}
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @reconcile_workgroup_size {
+ hal.executable.variant public @reconcile_workgroup_size target(#hal.executable.target<"", "", {}>) {
+ hal.executable.export public @entry_point layout(#pipeline_layout)
+ builtin.module {
+ func.func @fn1() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [4]>} {
+ return
+ }
+ func.func @fn2() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [4]>} {
+ return
+ }
+ }
+ }
+}
+// CHECK-LABEL: hal.executable private @reconcile_workgroup_size
+// CHECK: hal.executable.export public @entry_point
+// CHECK-SAME: workgroup_size = [4 : index, 1 : index, 1 : index]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @single_translation_info {
+ hal.executable.variant public @single_translation_info target(#hal.executable.target<"", "", {}>) {
+ hal.executable.export public @entry_point layout(#pipeline_layout)
+ builtin.module {
+ func.func @fn1() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [4]>} {
+ return
+ }
+ func.func @fn2() {
+ return
+ }
+ }
+ }
+}
+// CHECK-LABEL: hal.executable private @single_translation_info
+// CHECK: hal.executable.export public @entry_point
+// CHECK-SAME: workgroup_size = [4 : index, 1 : index, 1 : index]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @err_mismatched_workgroup_size {
+ hal.executable.variant public @err_mismatched_workgroup_size target(#hal.executable.target<"", "", {}>) {
+ // expected-error @+1 {{failed to reconcile workgroup sizes}}
+ hal.executable.export public @entry_point layout(#pipeline_layout)
+ builtin.module {
+ func.func @fn1() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [4]>} {
+ return
+ }
+ func.func @fn2() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [4, 2]>} {
+ return
+ }
+ }
+ }
+}
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @err_mismatched_workgroup_size2 {
+ hal.executable.variant public @err_mismatched_workgroup_size2 target(#hal.executable.target<"", "", {}>) {
+ // expected-error @+1 {{failed to reconcile workgroup sizes}}
+ hal.executable.export public @entry_point layout(#pipeline_layout)
+ builtin.module {
+ func.func @fn1() attributes {translation_info = #iree_codegen.translation_info<None workgroup_size = [4]>} {
+ return
+ }
+ func.func @fn2() attributes {translation_info = #iree_codegen.translation_info<None>} {
+ return
+ }
+ }
+ }
+}
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @reconcile_subgroup_size {
+ hal.executable.variant public @reconcile_subgroup_size target(#hal.executable.target<"", "", {}>) {
+ hal.executable.export public @entry_point layout(#pipeline_layout)
+ builtin.module {
+ func.func @fn1() attributes {translation_info = #iree_codegen.translation_info<None subgroup_size = 32>} {
+ return
+ }
+ func.func @fn2() attributes {translation_info = #iree_codegen.translation_info<None subgroup_size = 32>} {
+ return
+ }
+ }
+ }
+}
+// CHECK-LABEL: hal.executable private @reconcile_subgroup_size
+// CHECK: hal.executable.export public @entry_point
+// CHECK-SAME: subgroup_size = 32 : index
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @err_reconcile_subgroup_size {
+ hal.executable.variant public @err_reconcile_subgroup_size target(#hal.executable.target<"", "", {}>) {
+ hal.executable.export public @entry_point layout(#pipeline_layout)
+ builtin.module {
+ func.func @fn1() attributes {translation_info = #iree_codegen.translation_info<None subgroup_size = 32>} {
+ return
+ }
+ func.func @fn2() attributes {translation_info = #iree_codegen.translation_info<None subgroup_size = 32>} {
+ return
+ }
+ }
+ }
+}
+// CHECK-LABEL: hal.executable private @err_reconcile_subgroup_size
+// CHECK: hal.executable.export public @entry_point
+// CHECK-SAME: subgroup_size = 32 : index
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
+hal.executable private @waves_per_eu {
+ hal.executable.variant public @waves_per_eu target(#hal.executable.target<"", "", {}>) {
+ hal.executable.export public @entry_point layout(#pipeline_layout)
+ builtin.module {
+ func.func @fn1() attributes {translation_info = #iree_codegen.translation_info<None, {waves_per_eu = 2}>} {
+ return
+ }
+ func.func @fn2() attributes {translation_info = #iree_codegen.translation_info<None, {waves_per_eu = 4}>} {
+ return
+ }
+ }
+ }
+}
+// CHECK-LABEL: hal.executable private @waves_per_eu
+// CHECK: func.func @fn1() attributes {waves_per_eu = 2 : index}
+// CHECK: func.func @fn2() attributes {waves_per_eu = 4 : index}
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir b/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir
index afc09eb..84ab767 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir
@@ -6,19 +6,17 @@
#hal.descriptor_set.binding<1, storage_buffer>
]>
]>
-
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [64, 1, 1]>
// CHECK-LABEL: func.func @dispatch_0()
hal.executable private @dispatch_0 {
hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @dispatch_0 layout(#pipeline_layout) attributes {
- workgroup_size = [64: index, 1: index, 1:index]
- } {
+ hal.executable.export @dispatch_0 layout(#pipeline_layout) {
^bb0(%arg0: !hal.device) :
%c1 = arith.constant 1 : index
hal.return %c1, %c1, %c1 : index, index, index
}
builtin.module {
- func.func @dispatch_0() {
+ func.func @dispatch_0() attributes {translation_info = #translation_info} {
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
// CHECK: %[[C250:.+]] = arith.constant 250 : index
@@ -58,19 +56,17 @@
]>
// CHECK-LABEL: func.func @workgroup_tile_loop()
-#translation = #iree_codegen.translation_info<LLVMGPUDistribute>
+#translation = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [32, 1, 1]>
hal.executable private @workgroup_tile_loop {
hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @workgroup_tile_loop layout(#pipeline_layout) attributes {
- translation_info = #translation
- } {
+ hal.executable.export @workgroup_tile_loop layout(#pipeline_layout) {
^bb0(%arg0 : !hal.device, %arg1 : index):
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
hal.return %c64, %c1, %c1 : index, index, index
}
builtin.module {
- func.func @workgroup_tile_loop() {
+ func.func @workgroup_tile_loop() attributes {translation_info = #translation} {
%c2048 = arith.constant 2048 : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
@@ -100,16 +96,14 @@
#translation = #iree_codegen.translation_info<LLVMGPUDistribute>
hal.executable private @workgroup_tile_loop_negative {
hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @workgroup_tile_loop_negative layout(#pipeline_layout) attributes {
- translation_info = #translation
- } {
+ hal.executable.export @workgroup_tile_loop_negative layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1 : index):
%c1 = arith.constant 1 : index
%0 = affine.apply affine_map<(d0) -> (d0 ceildiv 16)>(%arg1)
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module {
- func.func @workgroup_tile_loop_negative() {
+ func.func @workgroup_tile_loop_negative() attributes {translation_info = #translation} {
%c2048 = arith.constant 2048 : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
@@ -138,13 +132,10 @@
// CHECK-LABEL: func.func @both_workgroup_and_workitem()
// CHECK-NOT: scf.for
// CHECK: gpu.barrier
-#translation = #iree_codegen.translation_info<LLVMGPUDistribute>
+#translation = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [8, 2, 1]>
hal.executable private @both_workgroup_and_workitem {
hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @both_workgroup_and_workitem layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [8: index, 2: index, 1: index]
- } {
+ hal.executable.export @both_workgroup_and_workitem layout(#pipeline_layout) {
^bb0(%arg0 : !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
%c1 = arith.constant 1 : index
%c14 = arith.constant 14 : index
@@ -152,7 +143,7 @@
hal.return %c1, %c14, %c112 : index, index, index
}
builtin.module {
- func.func @both_workgroup_and_workitem() {
+ func.func @both_workgroup_and_workitem() attributes {translation_info = #translation} {
%c8 = arith.constant 8 : index
%c32 = arith.constant 32 : index
%c112 = arith.constant 112 : index
@@ -204,14 +195,14 @@
#map3 = affine_map<(d0)[s0] -> (d0 + s0)>
hal.executable private @simple_mul {
hal.executable.variant public @variant target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export public @simple_mul ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @simple_mul ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map0()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module {
- func.func @simple_mul() {
+ func.func @simple_mul() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index 0565b04..17f9b15 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -1,6 +1,6 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-tile-and-distribute-to-workgroups)), canonicalize, cse)' --split-input-file %s | FileCheck %s
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-tile-and-distribute-to-workgroups{max-workgroup-parallel-dims=1})), canonicalize, cse)' --split-input-file %s | FileCheck %s -check-prefix=CHECKW
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-tile-and-distribute-to-workgroups{distribution-method=2})), canonicalize, cse)' --split-input-file %s | FileCheck %s -check-prefix=NO-LOOP
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-tile-and-distribute-to-workgroups, canonicalize)), cse)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-tile-and-distribute-to-workgroups{max-workgroup-parallel-dims=1}, canonicalize)), cse)))' --split-input-file %s | FileCheck %s -check-prefix=CHECKW
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-tile-and-distribute-to-workgroups{distribution-method=2})), canonicalize, cse)))' --split-input-file %s | FileCheck %s -check-prefix=NO-LOOP
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [16, 4, 0], [0, 0, 64]]>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
@@ -18,13 +18,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @matmul_tensors {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @matmul_tensors layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @matmul_tensors layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @matmul_tensors() {
+ func.func @matmul_tensors() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -59,7 +59,6 @@
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @matmul_tensors
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_M:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_N:[a-zA-Z0-9_]+]]: index
@@ -69,6 +68,7 @@
// CHECK-DAG: %[[D1:.+]] = affine.apply #[[MAP0]]()[%[[WORKLOAD_N]]]
// CHECK: hal.return %[[D1]], %[[D0]], %[[C1]] : index, index, index
// CHECK: func.func @matmul_tensors()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0]
// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1]
// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2]
@@ -117,13 +117,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @add {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @add layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @add layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @add() {
+ func.func @add() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%0 = flow.dispatch.workload.ordinal %cl_0, 0 : index
@@ -158,7 +158,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @add
// CHECK: hal.executable.export public @add
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index)
@@ -167,6 +166,7 @@
// CHECK-DAG: %[[D1:.+]] = affine.apply #[[MAP]]()[%[[WORKLOAD_1]]]
// CHECK: hal.return %[[D1]], %[[D0]], %[[C1]] : index, index, index
// CHECK: func.func @add()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: %[[RESULT:.+]] = linalg.generic
@@ -191,13 +191,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @add4D {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @add4D layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @add4D layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 :index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @add4D() {
+ func.func @add4D() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -234,7 +234,6 @@
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @add4D
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -245,6 +244,7 @@
// CHECK-DAG: %[[D2:.+]] = affine.apply #[[MAP]]()[%[[WORKLOAD_3]]]
// CHECK: hal.return %[[D2]], %[[D1]], %[[D0]] : index, index, index
// CHECK: func.func @add4D()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: scf.for %[[IV2:.+]] =
@@ -271,13 +271,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @add_distribute4D {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @add_distribute4D layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @add_distribute4D layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 :index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @add_distribute4D() {
+ func.func @add_distribute4D() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -324,7 +324,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @add_distribute4D
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -334,7 +333,8 @@
// CHECK-DAG: %[[D1:.+]] = affine.apply #[[MAP]]()[%[[WORKLOAD_2]]]
// CHECK-DAG: %[[D2:.+]] = affine.apply #[[MAP]]()[%[[WORKLOAD_3]]]
// CHECK: hal.return %[[D2]], %[[D1]], %[[D0]] : index, index, index
-// CHECK: func.func @add_distribute4D() {
+// CHECK: func.func @add_distribute4D()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[D0:.*]] = hal.interface.constant.load[0] : index
// CHECK-DAG: %[[D1:.*]] = hal.interface.constant.load[1] : index
// CHECK-DAG: %[[D2:.*]] = hal.interface.constant.load[2] : index
@@ -392,13 +392,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @add_distribute4D_zero_tile_size {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @add_distribute4D_zero_tile_size layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @add_distribute4D_zero_tile_size layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 :index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @add_distribute4D_zero_tile_size() {
+ func.func @add_distribute4D_zero_tile_size() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -437,7 +437,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @add_distribute4D_zero_tile_size
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -448,6 +447,7 @@
// CHECK-DAG: %[[D2:.+]] = affine.apply #[[MAP]]()[%[[WORKLOAD_3]]]
// CHECK: hal.return %[[D2]], %[[D1]], %[[D0]] : index, index, index
// CHECK: func.func @add_distribute4D_zero_tile_size()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
@@ -466,13 +466,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @batch_matmul_tensors {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @batch_matmul_tensors layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @batch_matmul_tensors layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @batch_matmul_tensors() {
+ func.func @batch_matmul_tensors() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
@@ -537,13 +537,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @preset_config_matmul_tensors {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @preset_config layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @preset_config layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @preset_config() {
+ func.func @preset_config() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
@@ -568,7 +568,6 @@
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 32)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 16)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @preset_config
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device)
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
@@ -601,13 +600,13 @@
#translation = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
hal.executable public @copy_op {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @copy_op layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @copy_op layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3: index, %arg4 : index, %arg5: index, %arg6 : index, %arg7: index, %arg8 : index, %arg9: index, %arg10: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @copy_op() {
+ func.func @copy_op() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -651,12 +650,12 @@
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
// CHECK: hal.executable.export public @copy_op
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: %[[C1:.+]] = arith.constant 1 : index
// CHECK: %[[D0:.+]] = affine.apply #[[MAP0]]()[%{{.+}}]
// CHECK: %[[D1:.+]] = affine.apply #[[MAP0]]()[%{{.+}}]
// CHECK: hal.return %[[D1]], %[[D0]], %[[C1]]
// CHECK: func.func @copy_op()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[SOURCE_SIZE_Y:.+]] = hal.interface.constant.load[0] : index
// CHECK-DAG: %[[SOURCE_SIZE_X:.+]] = hal.interface.constant.load[1] : index
// CHECK-DAG: %[[DEST_SIZE_Y:.+]] = hal.interface.constant.load[2] : index
@@ -702,13 +701,13 @@
#translation = #iree_codegen.translation_info<CPUDefault>
hal.executable private @static_1d_fft_stage2 {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @static_1d_fft_stage2 layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @static_1d_fft_stage2 layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @static_1d_fft_stage2() {
+ func.func @static_1d_fft_stage2() attributes {translation_info = #translation} {
%c2 = arith.constant 2 : index
%cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
%cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
@@ -734,11 +733,11 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
// CHECK: hal.executable private @static_1d_fft_stage2
// CHECK: hal.executable.export public @static_1d_fft_stage2
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK: hal.return %[[C3]], %[[C1]], %[[C1]] : index, index, index
// CHECK: func.func @static_1d_fft_stage2()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: %[[RESULT:.+]]:2 = iree_linalg_ext.fft
// CHECK-DAG: flow.dispatch.tensor.store %[[RESULT]]#0, %{{.+}}, offsets = [%[[IV0]]]
@@ -757,13 +756,13 @@
#translation = #iree_codegen.translation_info<CPUDefault>
hal.executable private @static_3d_fft_stage3 {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @static_3d_fft_stage3 layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @static_3d_fft_stage3 layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @static_3d_fft_stage3() {
+ func.func @static_3d_fft_stage3() attributes {translation_info = #translation} {
%c3 = arith.constant 3 : index
%cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
%cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
@@ -781,11 +780,11 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
// CHECK: hal.executable private @static_3d_fft_stage3
// CHECK: hal.executable.export public @static_3d_fft_stage3
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: hal.return %[[C1]], %[[C2]], %[[C1]] : index, index, index
// CHECK: func.func @static_3d_fft_stage3()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK-DAG: %[[SUBVIEW1:.+]] = memref.subview %{{.+}}[0, %[[IV0]], %[[IV1]]]
@@ -813,13 +812,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @outs_fusion {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @outs_fusion_fn layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @outs_fusion_fn layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @outs_fusion_fn() {
+ func.func @outs_fusion_fn() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
@@ -861,7 +860,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @outs_fusion
// CHECK: hal.executable.export public @outs_fusion_fn
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -871,6 +869,7 @@
// CHECK-DAG: %[[D1:.+]] = affine.apply #[[MAP]]()[%[[WORKLOAD_1]]]
// CHECK: hal.return %[[D1]], %[[D0]], %[[C1]] : index, index, index
// CHECK: func.func @outs_fusion_fn
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: %[[INIT:.+]] = tensor.empty
@@ -897,13 +896,13 @@
#translation = #iree_codegen.translation_info<CPUDefault>
hal.executable private @conv {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @conv layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @conv layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index, %arg7 : index, %arg8 : index, %arg9 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @conv() {
+ func.func @conv() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -947,7 +946,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
// CHECK: hal.executable private @conv
// CHECK: hal.executable.export public @conv
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -961,6 +959,7 @@
// CHECK-DAG: %[[D2:.+]] = affine.apply #[[MAP]]()[%[[WORKLOAD_3]]]
// CHECK: hal.return %[[D2]], %[[D1]], %[[D0]] : index, index, index
// CHECK: func.func @conv()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: scf.for %[[IV2:.+]] =
@@ -987,13 +986,13 @@
#translation = #iree_codegen.translation_info<CPUDefault>
hal.executable private @conv_static {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @conv_static layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @conv_static layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @conv_static() {
+ func.func @conv_static() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<1x161x161x96xf32>>
@@ -1022,7 +1021,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
// CHECK: hal.executable private @conv_static
// CHECK: hal.executable.export public @conv_static
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -1034,6 +1032,7 @@
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: hal.return %[[C2]], %[[C2]], %[[C4]] : index, index, index
// CHECK: func.func @conv_static()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: scf.for %[[IV2:.+]] =
@@ -1062,13 +1061,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @generic_static {
hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @generic_static layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @generic_static layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @generic_static() {
+ func.func @generic_static() attributes {translation_info = #translation} {
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<96x16xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -1093,11 +1092,11 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @generic_static
// CHECK: hal.executable.export public @generic_static
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
// CHECK: hal.return %[[C3]], %[[C1]], %[[C1]] : index, index, index
// CHECK: func.func @generic_static()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: %[[RESULT:.+]] = linalg.generic
// CHECK: flow.dispatch.tensor.store %[[RESULT]], %{{.+}}, offsets = [0, %[[IV0]]]
@@ -1128,13 +1127,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @matmul_static {
hal.executable.variant public @system_elf_arm_64 target(#executable_target_system_elf_arm_64_) {
- hal.executable.export public @matmul_static layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @matmul_static layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @matmul_static() {
+ func.func @matmul_static() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<196x240xf32>>
@@ -1159,10 +1158,8 @@
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 28)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 8)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @matmul_static
// CHECK: hal.executable.export public @matmul_static
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index
// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
@@ -1185,13 +1182,13 @@
#translation = #iree_codegen.translation_info<CPUDefault>
hal.executable private @restrict_num_workgroups {
hal.executable.variant public @system_elf_arm_64 target(#executable_target_system_elf_arm_64_) {
- hal.executable.export public @restrict_num_workgroups layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @restrict_num_workgroups layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @restrict_num_workgroups() {
+ func.func @restrict_num_workgroups() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<1x11x11x576xf32>>
@@ -1215,10 +1212,8 @@
}
}
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 64)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
// CHECK: hal.executable private @restrict_num_workgroups
// CHECK: hal.executable.export public @restrict_num_workgroups
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
@@ -1251,7 +1246,7 @@
}
builtin.module {
func.func @reduction(%arg0 : !flow.dispatch.tensor<readonly:tensor<7x7x2048xf32>>,
- %arg1 : !flow.dispatch.tensor<writeonly:tensor<7xf32>>) {
+ %arg1 : !flow.dispatch.tensor<writeonly:tensor<7xf32>>) attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.000000e+01 : f32
%0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [7, 7, 2048], strides = [1, 1, 1]
@@ -1283,11 +1278,11 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @reduction
// CHECK: hal.executable.export public @reduction
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: hal.return %[[C2]], %[[C1]], %[[C1]] : index, index, index
// CHECK: func.func @reduction
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: %[[INIT:.+]] = tensor.empty
// CHECK: %[[FILL:.+]] = linalg.fill
@@ -1314,13 +1309,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @gemm_unit_N {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @gemm_unit_N ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @gemm_unit_N ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @gemm_unit_N() {
+ func.func @gemm_unit_N() attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
@@ -1351,7 +1346,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @gemm_unit_N
// CHECK: hal.executable.export public @gemm_unit_N
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index,
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index)
@@ -1359,6 +1353,7 @@
// CHECK-DAG: %[[D0:.+]] = affine.apply #[[MAP0]]()[%[[WORKLOAD_0]]
// CHECK: hal.return %[[D0]], %[[C1]], %[[C1]] : index, index, index
// CHECK: func.func @gemm_unit_N()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0]
// CHECK-DAG: %[[WG_ID_X:.+]] = hal.interface.workgroup.id[0]
// CHECK-DAG: %[[WG_COUNT_X:.+]] = hal.interface.workgroup.count[0]
@@ -1386,13 +1381,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @gemm_unit_M_unit_N {
hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @gemm_unit_M_unit_N ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @gemm_unit_M_unit_N ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @gemm_unit_M_unit_N() {
+ func.func @gemm_unit_M_unit_N() attributes {translation_info = #translation} {
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load[0] : index
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0)
@@ -1419,11 +1414,11 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @gemm_unit_M_unit_N
// CHECK: hal.executable.export public @gemm_unit_M_unit_N
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device)
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK: hal.return %[[C1]], %[[C1]], %[[C1]] : index, index, index
// CHECK: func.func @gemm_unit_M_unit_N()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NOT: scf.for
// CHECK: %[[GEMM:.+]] = linalg.matmul
// CHECK: flow.dispatch.tensor.store %[[GEMM]], %{{.+}}, offsets = [0, 0]
@@ -1446,13 +1441,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @generic_unit_dims {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @generic_unit_dims layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @generic_unit_dims layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @generic_unit_dims() {
+ func.func @generic_unit_dims() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -1486,7 +1481,6 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @generic_unit_dims
// CHECK: hal.executable.export public @generic_unit_dims
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -1497,6 +1491,7 @@
// CHECK-DAG: %[[D2:.+]] = affine.apply #[[MAP0]]()[%[[WORKLOAD_3]]]
// CHECK: hal.return %[[D2]], %[[D1]], %[[D0]] : index, index, index
// CHECK: func.func @generic_unit_dims()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK: scf.for %[[IV2:.+]] =
@@ -1522,13 +1517,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @reduce_to_scalar {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @reduce_to_scalar layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @reduce_to_scalar layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @reduce_to_scalar() {
+ func.func @reduce_to_scalar() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%0 = flow.dispatch.workload.ordinal %cl_0, 0 : index
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
@@ -1556,12 +1551,12 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable private @reduce_to_scalar
// CHECK: hal.executable.export public @reduce_to_scalar
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index)
// CHECK: %[[C1:.+]] = arith.constant 1 : index
// CHECK: hal.return %[[C1]], %[[C1]], %[[C1]] : index, index, index
// CHECK: func.func @reduce_to_scalar()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NOT: scf.for
// -----
@@ -1582,13 +1577,13 @@
#translation = #iree_codegen.translation_info<CPUDefault>
hal.executable private @scalar {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @scalar layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @scalar layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @scalar() {
+ func.func @scalar() attributes {translation_info = #translation} {
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<f32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -1615,11 +1610,11 @@
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
// CHECK: hal.executable private @scalar
// CHECK: hal.executable.export public @scalar
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device)
// CHECK: %[[C1:.+]] = arith.constant 1 : index
// CHECK: hal.return %[[C1]], %[[C1]], %[[C1]] : index, index, index
// CHECK: func.func @scalar()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NOT: scf.for
// -----
@@ -1639,13 +1634,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @rank_reduced_slice {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @rank_reduced_slice layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @rank_reduced_slice layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @rank_reduced_slice() {
+ func.func @rank_reduced_slice() attributes {translation_info = #translation} {
%in_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:tensor<5x40xf32>>
%out_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
@@ -1672,12 +1667,12 @@
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 + 10)>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @rank_reduced_slice
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: %[[WORKLOAD:[a-zA-Z0-9]+]]: index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1
// CHECK-DAG: %[[C5:.+]] = arith.constant 5
// CHECK: hal.return %[[C5]], %[[C1]], %[[C1]]
// CHECK: func.func @rank_reduced_slice()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[SRC_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0)
// CHECK-SAME: : !flow.dispatch.tensor<readonly:tensor<5x40xf32>>
// CHECK-DAG: %[[DST_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1)
@@ -1707,13 +1702,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @matmul_interchange {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @matmul_interchange layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @matmul_interchange layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @matmul_interchange() {
+ func.func @matmul_interchange() attributes {translation_info = #translation} {
%cl_0 = hal.interface.constant.load[0] : index
%cl_1 = hal.interface.constant.load[1] : index
%cl_2 = hal.interface.constant.load[2] : index
@@ -1747,7 +1742,6 @@
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @matmul_interchange
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_0:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_1:[a-zA-Z0-9_]+]]: index
@@ -1757,6 +1751,7 @@
// CHECK-DAG: %[[D1:.+]] = affine.apply #[[MAP1]]()[%[[WORKLOAD_1]]]
// CHECK: hal.return %[[D0]], %[[D1]], %[[C1]] : index, index, index
// CHECK: func.func @matmul_interchange()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] : index
// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] : index
// CHECK: scf.for %{{.+}} = %{{.+}} to %[[D1]] step %{{.+}} {
@@ -1766,13 +1761,13 @@
hal.executable private @no_compute {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export public @no_compute ordinal(0) layout(#hal.pipeline.layout<push_constants = 5, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
+ hal.executable.export public @no_compute ordinal(0) layout(#hal.pipeline.layout<push_constants = 5, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4 : index, %arg5 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @no_compute() {
+ func.func @no_compute() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%cl_0 = hal.interface.constant.load[0] : i32
%cl_1 = hal.interface.constant.load[1] : i32
@@ -1809,14 +1804,13 @@
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf_x86_64", {}>) {
hal.executable.export public @tile_multiuse_producer ordinal(0) layout (#hal.pipeline.layout<
push_constants = 0, sets = [<0, bindings = [
- <0, storage_buffer, ReadOnly>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
+ <0, storage_buffer, ReadOnly>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @tile_multiuse_producer() {
+ func.func @tile_multiuse_producer() attributes {translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.000000e+00 : f32
@@ -1906,13 +1900,13 @@
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
hal.executable.export public @no_tile ordinal(0) layout(#hal.pipeline.layout<
push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>, <3, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
+ {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @no_tile() {
+ func.func @no_tile() attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
%c0 = arith.constant 0 : index
%c64 = arith.constant 64 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<10xf32>>
@@ -1946,13 +1940,13 @@
hal.executable.export public @gemm_lhs_pack ordinal(0)
layout(#hal.pipeline.layout<push_constants = 0,
sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @gemm_lhs_pack() {
+ func.func @gemm_lhs_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
@@ -1986,13 +1980,13 @@
hal.executable.export public @gemm_rhs_transpose_pack ordinal(0)
layout(#hal.pipeline.layout<push_constants = 0,
sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @gemm_rhs_transpose_pack() {
+ func.func @gemm_rhs_transpose_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%c0 = arith.constant 0 : index
%c114688 = arith.constant 114688 : index
%cst = arith.constant 0.000000e+00 : f32
@@ -2026,13 +2020,13 @@
hal.executable.export public @clone_index_computations ordinal(0) layout(
#hal.pipeline.layout<push_constants = 4, sets = [
<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3 : index, %arg4 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @clone_index_computations() {
+ func.func @clone_index_computations() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%cl_0 = hal.interface.constant.load[0] : i32
@@ -2089,13 +2083,13 @@
hal.executable.export public @dynamic_unpack ordinal(0) layout(
#hal.pipeline.layout<push_constants = 4, sets = [
<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @dynamic_unpack() {
+ func.func @dynamic_unpack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%c131072 = arith.constant 131072 : index
%c0 = arith.constant 0 : index
%cl_0 = hal.interface.constant.load[0] : i32
@@ -2135,13 +2129,13 @@
hal.executable.export public @dynamic_unpack_dynamic_tile ordinal(0) layout(
#hal.pipeline.layout<push_constants = 4, sets = [
<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @dynamic_unpack_dynamic_tile() {
+ func.func @dynamic_unpack_dynamic_tile() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%c131072 = arith.constant 131072 : index
%c0 = arith.constant 0 : index
%c16 = arith.constant 16 : index
@@ -2183,13 +2177,13 @@
hal.executable.export public @unpack_elem ordinal(0) layout(
#hal.pipeline.layout<push_constants = 0, sets = [
<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @unpack_elem() {
+ func.func @unpack_elem() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x48x8x8xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
@@ -2221,13 +2215,13 @@
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
hal.executable private @dynamic_unpack_fusion {
hal.executable.variant public @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb", {ukernels = true}>) {
- hal.executable.export public @dynamic_unpack_fusion ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<VMVXDefault>} {
+ hal.executable.export public @dynamic_unpack_fusion ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @dynamic_unpack_fusion() {
+ func.func @dynamic_unpack_fusion() attributes {translation_info = #iree_codegen.translation_info<VMVXDefault>} {
%c200960 = arith.constant 200960 : index
%c1003776 = arith.constant 1003776 : index
%c1053952 = arith.constant 1053952 : index
@@ -2277,13 +2271,13 @@
hal.executable private @elem_pack {
hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>) {
- hal.executable.export public @elem_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>, <4, storage_buffer>, <5, storage_buffer>]>]>) attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ hal.executable.export public @elem_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>, <4, storage_buffer>, <5, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @elem_pack() {
+ func.func @elem_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%c1339392 = arith.constant 1339392 : index
%c0 = arith.constant 0 : index
%c823296 = arith.constant 823296 : index
@@ -2343,13 +2337,13 @@
hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
hal.executable.export public @scatter ordinal(0)
layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<LLVMGPUDistribute>, workgroup_size = [1 : index, 1 : index, 1 : index]} {
+ {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @scatter() {
+ func.func @scatter() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [1, 1, 1]>} {
%c228075520 = arith.constant 228075520 : index
%c251668480 = arith.constant 251668480 : index
%c0 = arith.constant 0 : index
@@ -2430,13 +2424,13 @@
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
hal.executable private @matmul_tensors {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @matmul_tensor_count_from_dag_root layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @matmul_tensor_count_from_dag_root layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @matmul_tensor_count_from_dag_root() {
+ func.func @matmul_tensor_count_from_dag_root() attributes {translation_info = #translation} {
%0 = hal.interface.constant.load[0] : index
%1 = hal.interface.constant.load[1] : index
%2 = hal.interface.constant.load[2] : index
@@ -2466,9 +2460,7 @@
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 64)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @matmul_tensor_count_from_dag_root
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-NEXT: (%[[DEVICE:.+]]: !hal.device,
// CHECK-SAME: %[[WORKLOAD_M:[a-zA-Z0-9_]+]]: index
// CHECK-SAME: %[[WORKLOAD_N:[a-zA-Z0-9_]+]]: index
@@ -2488,7 +2480,7 @@
module {
hal.executable private @matmul_tensors {
hal.executable.variant public @llvm target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @matmul_already_distributed layout(#pipeline_layout) attributes {translation_info = #translation} {
+ hal.executable.export public @matmul_already_distributed layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
%0 = affine.apply #map()[%arg1]
@@ -2496,7 +2488,7 @@
hal.return %1, %0, %c1 : index, index, index
}
builtin.module {
- func.func @matmul_already_distributed() {
+ func.func @matmul_already_distributed() attributes {translation_info = #translation} {
%0 = hal.interface.constant.load[0] : index
%1 = hal.interface.constant.load[1] : index
%2 = hal.interface.constant.load[2] : index
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
index fec7bf7..945cad7 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
@@ -59,10 +59,22 @@
TranslationInfoAttr TranslationInfoAttr::get(
MLIRContext *context, DispatchLoweringPassPipeline passPipeline,
- SymbolRefAttr codegenSpec, DictionaryAttr configuration) {
+ SymbolRefAttr codegenSpec, ArrayRef<int64_t> workgroupSize,
+ std::optional<int64_t> subgroupSize, DictionaryAttr configuration) {
auto pipelineAttr =
DispatchLoweringPassPipelineAttr::get(context, passPipeline);
- return get(context, pipelineAttr, codegenSpec, configuration);
+ return get(context, pipelineAttr, codegenSpec, workgroupSize,
+ subgroupSize.value_or(int64_t()), configuration);
+}
+
+TranslationInfoAttr TranslationInfoAttr::get(
+ MLIRContext *context, DispatchLoweringPassPipeline passPipeline,
+ ArrayRef<int64_t> workgroupSize, std::optional<int64_t> subgroupSize,
+ DictionaryAttr configuration) {
+ auto pipelineAttr =
+ DispatchLoweringPassPipelineAttr::get(context, passPipeline);
+ return get(context, pipelineAttr, /*codegenSpec=*/SymbolRefAttr(),
+ workgroupSize, subgroupSize.value_or(int64_t()), configuration);
}
DispatchLoweringPassPipeline
@@ -73,7 +85,8 @@
LogicalResult TranslationInfoAttr::verify(
function_ref<InFlightDiagnostic()> emitError,
IREE::Codegen::DispatchLoweringPassPipelineAttr passPipeline,
- SymbolRefAttr codegenSpec, DictionaryAttr configuration) {
+ SymbolRefAttr codegenSpec, ArrayRef<int64_t> workgroupSize,
+ int64_t subgroupSize, DictionaryAttr configuration) {
if (!passPipeline) {
return emitError() << "missing pass pipeline specification";
}
@@ -89,6 +102,15 @@
<< "transform dialect codegen spec requires pass pipeline : "
<< stringifyEnum(tdPassPipeline);
}
+ if (workgroupSize.size() > 3) {
+ return emitError() << "workgroup size cannot have more than 3 entries";
+ }
+ if (llvm::any_of(workgroupSize, [](int64_t value) { return value <= 0; })) {
+ return emitError() << "workgroup size value has to be greater than zero";
+ }
+ if (subgroupSize < 0) {
+ return emitError() << "subgroup size value cannot be negative";
+ }
return success();
}
@@ -279,10 +301,10 @@
// iree.compilation_info
//===----------------------------------------------------------------------===//
-LogicalResult CompilationInfoAttr::verify(
- function_ref<InFlightDiagnostic()> emitError,
- LoweringConfigAttr loweringConfig, TranslationInfoAttr translationInfo,
- ArrayRef<int64_t> workgroupSize, std::optional<int64_t> subgroupSize) {
+LogicalResult
+CompilationInfoAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+ LoweringConfigAttr loweringConfig,
+ TranslationInfoAttr translationInfo) {
if (!loweringConfig) {
return emitError() << "missing lowering config";
}
@@ -294,10 +316,11 @@
if (!translationInfo) {
return emitError() << "missing translation info";
}
- if (failed(TranslationInfoAttr::verify(emitError,
- translationInfo.getPassPipeline(),
- translationInfo.getCodegenSpec(),
- translationInfo.getConfiguration()))) {
+ if (failed(TranslationInfoAttr::verify(
+ emitError, translationInfo.getPassPipeline(),
+ translationInfo.getCodegenSpec(), translationInfo.getWorkgroupSize(),
+ translationInfo.getSubgroupSize(),
+ translationInfo.getConfiguration()))) {
return failure();
}
return success();
@@ -324,80 +347,46 @@
// ===----------------------------------------------------------------------===//
IREE::Codegen::TranslationInfoAttr
-getTranslationInfo(IREE::HAL::ExecutableExportOp exportOp) {
- return exportOp->getAttrOfType<IREE::Codegen::TranslationInfoAttr>(
+getTranslationInfo(FunctionOpInterface funcOp) {
+ return funcOp->getAttrOfType<IREE::Codegen::TranslationInfoAttr>(
kTranslationInfoAttrName);
}
-std::optional<IREE::Codegen::TranslationInfoAttr>
-getIdenticalTranslationInfo(IREE::HAL::ExecutableVariantOp variantOp) {
- ModuleOp moduleOp = variantOp.getInnerModule();
- if (!moduleOp) {
+std::optional<SmallVector<int64_t>>
+getWorkgroupSize(FunctionOpInterface funcOp) {
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (!translationInfo) {
return std::nullopt;
}
-
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo;
- for (auto exportOp : variantOp.getExportOps()) {
- IREE::Codegen::TranslationInfoAttr currTranslationInfo =
- getTranslationInfo(exportOp);
- if (!currTranslationInfo) {
- continue;
- }
- if (!translationInfo) {
- translationInfo = currTranslationInfo;
- continue;
- }
- if (currTranslationInfo != translationInfo.value()) {
- return std::nullopt;
- }
- }
-
- return translationInfo;
+ return llvm::to_vector(translationInfo.getWorkgroupSize());
}
-SmallVector<int64_t> getWorkgroupSize(IREE::HAL::ExecutableExportOp exportOp) {
- if (std::optional<ArrayAttr> workgroupSizeAttrList =
- exportOp.getWorkgroupSize()) {
- return llvm::map_to_vector(*workgroupSizeAttrList, [](auto attr) {
- return llvm::cast<IntegerAttr>(attr).getInt();
- });
+std::optional<int64_t> getSubgroupSize(FunctionOpInterface funcOp) {
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (!translationInfo) {
+ return std::nullopt;
}
- return {};
-}
-
-std::optional<int64_t> getSubgroupSize(IREE::HAL::ExecutableExportOp exportOp) {
- if (IntegerAttr attr = exportOp.getSubgroupSizeAttr()) {
- return attr.getValue().getSExtValue();
+ // The underlying storage uses 0 for an unset optional scalar integer value,
+ // so a subgroup size of 0 is reported as not set.
+ if (translationInfo.getSubgroupSize() == int64_t()) {
+ return std::nullopt;
}
- return {};
-}
-
-LogicalResult setDispatchConfig(mlir::FunctionOpInterface entryPoint,
- ArrayRef<int64_t> workgroupSize,
- std::optional<int64_t> subgroupSize) {
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(entryPoint);
- if (failed(exportOp))
- return failure();
- MLIRContext *context = exportOp->getContext();
- if (!workgroupSize.empty()) {
- exportOp->setWorkgroupSizeAttr(getIndexArrayAttr(context, workgroupSize));
- }
- if (subgroupSize) {
- exportOp->setSubgroupSizeAttr(Builder(context).getIndexAttr(*subgroupSize));
- }
- return success();
+ return translationInfo.getSubgroupSize();
}
LogicalResult
setTranslationInfo(mlir::FunctionOpInterface entryPoint,
IREE::Codegen::TranslationInfoAttr translationInfo) {
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(entryPoint);
- if (failed(exportOp))
- return failure();
- exportOp.value()->setAttr(kTranslationInfoAttrName, translationInfo);
+ entryPoint->setAttr(kTranslationInfoAttrName, translationInfo);
return success();
}
+void eraseTranslationInfo(FunctionOpInterface funcOp) {
+ funcOp->removeAttr(kTranslationInfoAttrName);
+}
+
//===----------------------------------------------------------------------===//
// Helpers for getting/setting `iree_codegen.lowering_config` attribute on root
// operations.
@@ -448,6 +437,8 @@
op->setAttr(kConfigAttrName, config);
}
+void eraseLoweringConfig(Operation *op) { op->removeAttr(kConfigAttrName); }
+
//===----------------------------------------------------------------------===//
// Helpers for getting/setting `iree_codegen.compilation_info` attribute on root
// operations to override IREEs default compilation.
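With `workgroup_size` and `subgroup_size` folded into `translation_info`, a backend can record the whole lowering strategy on the function in one step. A minimal sketch, assuming the usual `mlir::iree_compiler` namespace; the helper name and the chosen pipeline are illustrative, not part of this patch:

```cpp
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"

using namespace mlir;
using namespace mlir::iree_compiler;

// Hypothetical helper: pick a GPU-style strategy for one function and record
// it directly on that function instead of on hal.executable.export.
static LogicalResult setExampleStrategy(FunctionOpInterface funcOp) {
  MLIRContext *context = funcOp.getContext();
  // Workgroup/subgroup sizes now live on the translation info itself.
  auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
      context, IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute,
      /*workgroupSize=*/{64, 1, 1}, /*subgroupSize=*/32);
  return setTranslationInfo(funcOp, translationInfo);
}
```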
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h
index f23073f..e519c78 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h
@@ -39,49 +39,16 @@
// `hal.executable.export`
//===----------------------------------------------------------------------===//
-/// Gets the translate executable info attribute value associated with
-/// `exportOp`. It expects that the attribute is stored using the identifier
-/// `translation_info`.
+/// Returns the translation info for the `funcOp`. Returns `nullptr` on failure.
IREE::Codegen::TranslationInfoAttr
-getTranslationInfo(IREE::HAL::ExecutableExportOp exportOp);
-/// Returns the translation info for the `funcOp` (by looking at the entry
-/// point). Returns `nullptr` on failure.
-inline IREE::Codegen::TranslationInfoAttr
-getTranslationInfo(mlir::FunctionOpInterface funcOp) {
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
- if (failed(exportOp))
- return nullptr;
- return getTranslationInfo(*exportOp);
-}
-
-/// Returns the identical TranslationInfoAttr. Returns nullptr if entry point
-/// functions have different TranslationInfoAttr.
-/// There might be multiple entry points in the module. Currently, all of them
-/// need to have the same translation info.
-/// TODO(ravishankarm): This is strange that this is not enforced
-/// structurally, but something to address later on. The main issue is how
-/// to invoke separate dynamic pass pipelines on entry point functions,
-/// when the passes might have module level changes. For now this
-/// restriction is fine.
-std::optional<IREE::Codegen::TranslationInfoAttr>
-getIdenticalTranslationInfo(IREE::HAL::ExecutableVariantOp variantOp);
-
-// TODO(ravishankarm, benvanik): Eventually all the information needed for the
-// lowering will be consolidated into a single attribute with richer
-// information.
+getTranslationInfo(mlir::FunctionOpInterface funcOp);
/// Returns the workgroup size specified on the `exportOp`.
-SmallVector<int64_t> getWorkgroupSize(IREE::HAL::ExecutableExportOp exportOp);
+std::optional<SmallVector<int64_t>>
+getWorkgroupSize(mlir::FunctionOpInterface funcOp);
/// Returns the subgroup size specified on the `exportOp`.
-std::optional<int64_t> getSubgroupSize(IREE::HAL::ExecutableExportOp exportOp);
-
-/// Sets and overwrites the dispatch workgroup/subgroup size for the given entry
-/// point function. Returns failure if the given entry point is not exported via
-/// hal.executable.export.
-LogicalResult setDispatchConfig(mlir::FunctionOpInterface entryPoint,
- ArrayRef<int64_t> workgroupSize,
- std::optional<int64_t> subgroupSize);
+std::optional<int64_t> getSubgroupSize(mlir::FunctionOpInterface funcOp);
/// Sets and overwrites the translate executable info for the given entry point.
/// Returns failure if the given entry point is not exported via
@@ -90,6 +57,9 @@
setTranslationInfo(mlir::FunctionOpInterface entryPoint,
IREE::Codegen::TranslationInfoAttr translationInfo);
+/// Erases any translation info set on an operation.
+void eraseTranslationInfo(mlir::FunctionOpInterface funcOp);
+
//===----------------------------------------------------------------------===//
// Helpers for getting/setting `iree_codegen.lowering_config` attribute on root
// operations.
@@ -144,10 +114,9 @@
auto config = IREE::Codegen::LoweringConfigAttr::get(context, tileSizes,
scalableTileFlags);
setLoweringConfig(op, config);
- if (failed(setDispatchConfig(entryPointFn, workgroupSize, subgroupSize)))
- return failure();
auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
- entryPointFn.getContext(), passPipeline, SymbolRefAttr(), pipelineConfig);
+ entryPointFn.getContext(), passPipeline, SymbolRefAttr(), workgroupSize,
+ subgroupSize, pipelineConfig);
return setTranslationInfo(entryPointFn, translationInfo);
}
@@ -165,6 +134,9 @@
subgroupSize, pipelineConfig);
}
+/// Function to erase lowering configs that are set on an operation.
+void eraseLoweringConfig(Operation *op);
+
//===----------------------------------------------------------------------===//
// Helpers for getting/setting `iree_codegen.compilation_info` attribute on root
// operations to override IREEs default compilation.
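On the query side, the helpers declared above now take the function directly, so passes no longer walk back to the export op. A usage sketch; the function name is assumed and only the getters come from this patch:

```cpp
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "llvm/Support/raw_ostream.h"

using namespace mlir;
using namespace mlir::iree_compiler;

// Hypothetical inspection helper showing the funcOp-based getters.
static void dumpLoweringChoices(FunctionOpInterface funcOp) {
  IREE::Codegen::TranslationInfoAttr translationInfo =
      getTranslationInfo(funcOp);
  if (!translationInfo)
    return; // No strategy was recorded for this function.
  std::optional<SmallVector<int64_t>> workgroupSize = getWorkgroupSize(funcOp);
  // Reads back std::nullopt for the 0 "unset" sentinel in the storage.
  std::optional<int64_t> subgroupSize = getSubgroupSize(funcOp);
  llvm::errs() << "workgroup size entries: "
               << (workgroupSize ? workgroupSize->size() : 0) << "\n"
               << "subgroup size set: " << subgroupSize.has_value() << "\n";
}
```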
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
index d942e33..1f8cd99 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
@@ -136,6 +136,8 @@
let assemblyFormat = [{
`<` `` $passPipeline
(`codegen_spec` `=` $codegenSpec^)?
+ (`workgroup_size` `=` `[` $workgroupSize^ `]`)?
+ (`subgroup_size` `=` $subgroupSize^)?
(`,` $configuration^)? `>`
}];
@@ -144,12 +146,20 @@
"Name of the pipeline to be invoked on the translation unit.">:$passPipeline,
OptionalParameter<"SymbolRefAttr",
"The symbol pointing to the transform dialect codegen spec to be used">:$codegenSpec,
+ OptionalArrayRefParameter<"int64_t", "The workgroup size to use">:$workgroupSize,
+ OptionalParameter<"int64_t", "The subgroup size to use">:$subgroupSize,
OptionalParameter<"DictionaryAttr",
"Pipeline specific configuration">:$configuration
);
let builders = [
AttrBuilder<(ins "DispatchLoweringPassPipeline":$passPipeline,
CArg<"SymbolRefAttr", "{}">:$codegenSpec,
+ CArg<"ArrayRef<int64_t>", "{}">:$workgroupSize,
+ CArg<"std::optional<int64_t>", "std::nullopt">:$subgroupSize,
+ CArg<"DictionaryAttr", "{}">:$configuration)>,
+ AttrBuilder<(ins "DispatchLoweringPassPipeline":$passPipeline,
+ "ArrayRef<int64_t>":$workgroupSize,
+ CArg<"std::optional<int64_t>", "std::nullopt">:$subgroupSize,
CArg<"DictionaryAttr", "{}">:$configuration)>
];
let extraClassDeclaration = [{
@@ -266,23 +276,13 @@
}];
let parameters = (ins
AttrParameter<"LoweringConfigAttr", "">:$loweringConfig,
- AttrParameter<"TranslationInfoAttr", "">:$translationInfo,
- OptionalArrayRefParameter<"int64_t", "The workgroup size to use during translation.">:$workgroupSize,
- OptionalParameter<"std::optional<int64_t>",
- "The subgroup size to use during translation.">:$subgroupSize
+ AttrParameter<"TranslationInfoAttr", "">:$translationInfo
);
let assemblyFormat = [{
- `<` `lowering_config` `=` $loweringConfig `,` `translation_info` `=` $translationInfo
- (`,` `workgroup_size` `=` `[` $workgroupSize^ `]`)?
- (`,` `subgroup_size` `=` $subgroupSize^)? `>`
+ `<` `lowering_config` `=` $loweringConfig `,` `translation_info` `=` $translationInfo `>`
}];
- let extraClassDeclaration = [{
- SmallVector<int64_t> getWorkgroupSizeVals() {
- return SmallVector<int64_t>(getWorkgroupSize());
- }
- }];
let genVerifyDecl = 1;
}
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/test/lowering_config_attr.mlir b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/test/lowering_config_attr.mlir
index d5576af..e5e2f9b 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/test/lowering_config_attr.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/test/lowering_config_attr.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file %s | FileCheck %s
+// RUN: iree-opt --split-input-file --verify-diagnostics %s | FileCheck %s
module {
func.func @test() attributes {
@@ -49,15 +49,13 @@
func.func @test() attributes {
compilation_info = #iree_codegen.compilation_info<
lowering_config = <tile_sizes = []>,
- translation_info = <CPUDefault>,
- workgroup_size = [16, 4, 1],
- subgroup_size = 32>} {
+ translation_info = <CPUDefault workgroup_size = [16, 4, 1] subgroup_size = 32>>} {
return
}
}
// CHECK: #config = #iree_codegen.lowering_config<tile_sizes = []>
-// CHECK: #translation = #iree_codegen.translation_info<CPUDefault>
-// CHECK: #compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation, workgroup_size = [16, 4, 1], subgroup_size = 32>
+// CHECK: #translation = #iree_codegen.translation_info<CPUDefault workgroup_size = [16, 4, 1] subgroup_size = 32>
+// CHECK: #compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
// -----
@@ -91,3 +89,39 @@
}
// CHECK: #config = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 128, 0], {sizes = [1, [32], 0], interchange = [2, 1, 0]}, [0, 0, 1], [0, 0, 0]{{\]}}>
}
+
+// -----
+
+module {
+ /// translation info cannot have more than 3 entries for workgroup size
+ func.func @workgroup_size_more_than_3_err() attributes {
+ // expected-error @+1 {{workgroup size cannot have more than 3 entries}}
+ translation_info = #iree_codegen.translation_info<None workgroup_size = [4, 1, 1, 1]> {
+ return
+ }
+ }
+}
+
+// -----
+
+module {
+ /// translation info workgroup_size values need to be greater than zero.
+ func.func @workgroup_size_neg_err() attributes {
+ // expected-error @+1 {{workgroup size value has to be greater than zero}}
+ translation_info = #iree_codegen.translation_info<None workgroup_size = [4, -1, 1]> {
+ return
+ }
+ }
+}
+
+// -----
+
+module {
+ /// translation info subgroup_size value cannot be negative.
+ func.func @subgroup_size_neg_err() attributes {
+ // expected-error @+1 {{subgroup size value cannot be negative}}
+ translation_info = #iree_codegen.translation_info<None subgroup_size = -1> {
+ return
+ }
+ }
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 80b9c3c..002c34f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2583,33 +2583,27 @@
return success();
}
-LogicalResult initCPULaunchConfig(ModuleOp moduleOp) {
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = exportOps.lookup(funcOp.getName());
- if (!exportOp)
- continue;
- if (getTranslationInfo(exportOp))
- continue;
+LogicalResult initCPULaunchConfig(FunctionOpInterface funcOp) {
+ if (getTranslationInfo(funcOp)) {
+ return success();
+ }
- // For now pick the default for functions with control flow, cause
- // the currently built pipelines dont work so well with control flow.
- if (funcOp.empty() || !llvm::hasSingleElement(funcOp.getFunctionBody())) {
- return lowerUsingDefaultPipeline(funcOp);
- }
+ // For now, pick the default pipeline for functions with control flow, because
+ // the currently built pipelines don't work well with control flow.
+ if (funcOp.empty() || !llvm::hasSingleElement(funcOp.getFunctionBody())) {
+ return lowerUsingDefaultPipeline(funcOp);
+ }
- SmallVector<Operation *> computeOps = getComputeOps(funcOp);
- if (failed(setTranslationInfoAndRootConfig(funcOp, computeOps))) {
- return failure();
- }
+ SmallVector<Operation *> computeOps = getComputeOps(funcOp);
+ if (failed(setTranslationInfoAndRootConfig(funcOp, computeOps))) {
+ return failure();
}
// The root configuration setting introduces `tensor.dim` operations.
// Resolve those away.
- RewritePatternSet patterns(moduleOp.getContext());
+ RewritePatternSet patterns(funcOp.getContext());
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
- return applyPatternsAndFoldGreedily(moduleOp, std::move(patterns));
+ return applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h
index 064be99..51c4f0a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.h
@@ -9,10 +9,11 @@
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
namespace mlir::iree_compiler {
-LogicalResult initCPULaunchConfig(ModuleOp moduleOp);
+LogicalResult initCPULaunchConfig(FunctionOpInterface funcOp);
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp
index b2b75a9..885681d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp
@@ -102,15 +102,13 @@
return;
}
- auto moduleOp = getOperation();
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- if (failed(checkStackAllocationSize(funcOp))) {
- return signalPassFailure();
- }
+ auto funcOp = getOperation();
+ if (failed(checkStackAllocationSize(funcOp))) {
+ return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<ModuleOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMCPUCheckIRBeforeLLVMConversionPass(bool failOnOutOfBounds) {
return std::make_unique<LLVMCPUCheckIRBeforeLLVMConversionPass>(
failOnOutOfBounds);
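For reference, the ModuleOp-to-function move above relies on upstream MLIR's `InterfacePass`, which lets a pass be scheduled on any op implementing `FunctionOpInterface`. A background sketch of that pattern only; IREE's real passes derive from generated base classes, so the class below is purely illustrative:

```cpp
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Pass/Pass.h"

namespace {
// Illustrative only: the general shape of a pass anchored on
// FunctionOpInterface rather than on a concrete op like ModuleOp.
struct ExampleFunctionInterfacePass
    : mlir::PassWrapper<ExampleFunctionInterfacePass,
                        mlir::InterfacePass<mlir::FunctionOpInterface>> {
  llvm::StringRef getArgument() const override { return "example-func-pass"; }
  void runOnOperation() override {
    mlir::FunctionOpInterface funcOp = getOperation();
    // Per-function work happens here; call signalPassFailure() on error.
    funcOp.emitRemark("visited by the example pass");
  }
};
} // namespace
```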
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 667719b..da45714 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -14,6 +14,7 @@
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/PDL/IR/PDL.h"
#include "mlir/Dialect/PDLInterp/IR/PDLInterp.h"
@@ -61,75 +62,38 @@
};
} // namespace
-/// Verify that valid configuration is set for all ops within the compiled
-/// module.
-template <typename F>
-static LogicalResult
-verifyLoweringConfiguration(ModuleOp module,
- IREE::Codegen::TranslationInfoAttr translationInfo,
- F verificationFn) {
- auto walkResult = module.walk([&](Operation *op) -> WalkResult {
- IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
- if (!loweringConfig)
- return WalkResult::advance();
- TilingConfig tilingConfig(loweringConfig);
- return verificationFn(op, tilingConfig, translationInfo,
- ArrayRef<int64_t>{});
- });
- return failure(walkResult.wasInterrupted());
-}
-
// TODO(dcaballe): We temporarily need this utility to retrieve a valid
// lowering config. We should be able to remove this once we have a lowering
// config attribute per op.
-static FailureOr<LoweringConfigAttr> getRootLoweringConfig(ModuleOp moduleOp) {
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
- for (auto &it : exportOps) {
- auto exportOp = it.second;
- auto rootLoweringConfig = iree_compiler::getLoweringConfig(exportOp);
- if (rootLoweringConfig) {
- return rootLoweringConfig;
- }
- }
-
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- getAllEntryPoints(moduleOp);
- SmallVector<Operation *> computeOps = getComputeOps(funcOp);
- // Check for self first.
- FailureOr<Operation *> rootOp = getRootOperation(computeOps);
- auto rootLoweringConfig = iree_compiler::getLoweringConfig(rootOp.value());
- if (rootLoweringConfig) {
- return rootLoweringConfig;
- }
+static FailureOr<LoweringConfigAttr>
+getRootLoweringConfig(FunctionOpInterface funcOp) {
+ SmallVector<Operation *> computeOps = getComputeOps(funcOp);
+ // Check for self first.
+ FailureOr<Operation *> rootOp = getRootOperation(computeOps);
+ auto rootLoweringConfig = iree_compiler::getLoweringConfig(rootOp.value());
+ if (rootLoweringConfig) {
+ return rootLoweringConfig;
}
return failure();
}
-static TilingConfig getTilingConfigForPipeline(ModuleOp moduleOp) {
- auto maybeLoweringConfig = getRootLoweringConfig(moduleOp);
+static TilingConfig getTilingConfigForPipeline(FunctionOpInterface funcOp) {
+ auto maybeLoweringConfig = getRootLoweringConfig(funcOp);
assert(succeeded(maybeLoweringConfig) &&
"Pipeline requires a lowering config");
return TilingConfig(*maybeLoweringConfig);
}
void LLVMCPULowerExecutableTargetPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- OpPassManager pipeline(IREE::HAL::ExecutableVariantOp::getOperationName());
-
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- variantOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
- return signalPassFailure();
+ auto funcOp = getOperation();
+ auto target = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+ if (!target) {
+ // Do nothing without target
+ return;
}
- ModuleOp moduleOp = variantOp.getInnerModule();
LLVMCPUPipelineOptions pipelineOpts;
- auto target = variantOp.getTarget();
if (isX86(target) || isRISCV(target)) {
pipelineOpts.useConfiguredVectorSizes = false;
}
@@ -140,7 +104,14 @@
pipelineOpts.enableUkernels = hasUkernel(target);
pipelineOpts.enableAArch64SSVE =
isAArch64(target) && hasAnySVEFeature(target) && hasSMEFeature(target);
- switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
+
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (!translationInfo)
+ return;
+
+ OpPassManager pipeline(func::FuncOp::getOperationName());
+ switch (translationInfo.getDispatchLoweringPassPipeline()) {
  // No pipeline specified, nothing to do.
case IREE::Codegen::DispatchLoweringPassPipeline::None:
return;
@@ -149,65 +120,58 @@
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUBufferOpsTileAndVectorize: {
- TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp);
+ TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
addCPUBufferOpsTileAndVectorizePipeline(pipeline, tilingConfig,
pipelineOpts);
break;
}
case IREE::Codegen::DispatchLoweringPassPipeline::CPUDoubleTilingExpert: {
- TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp);
+ TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
addMultiTilingExpertPassPipeline(pipeline, tilingConfig, pipelineOpts);
break;
}
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUDoubleTilingPeelingExpert: {
- TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp);
+ TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
pipelineOpts.enablePeeling = true;
addMultiTilingExpertPassPipeline(pipeline, tilingConfig, pipelineOpts);
break;
}
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUConvTileAndDecomposeExpert: {
- TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp);
+ TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
addConvTileAndDecomposeExpertPassPipeline(pipeline, tilingConfig,
pipelineOpts);
break;
}
case IREE::Codegen::DispatchLoweringPassPipeline::Mmt4dTilingExpert: {
- TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp);
+ TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
addMmt4dTilingExpertPassPipeline(pipeline, tilingConfig, pipelineOpts);
break;
}
case IREE::Codegen::DispatchLoweringPassPipeline::CPUDataTiling: {
- TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp);
+ TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
addCPUDataTilingPipeline(pipeline, tilingConfig, pipelineOpts);
break;
}
case IREE::Codegen::DispatchLoweringPassPipeline::
CPULinalgExtTileAndVectorize: {
- TilingConfig tilingConfig = getTilingConfigForPipeline(moduleOp);
+ TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
addCPULinalgExtTileAndVectorizePipeline(pipeline, tilingConfig,
pipelineOpts);
break;
}
- // Transform-dialect pipelines.
- case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: {
- SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec();
- addTransformDialectPasses(
- pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef(""));
- break;
- }
default:
- moduleOp.emitOpError("Unsupported pipeline on CPU target.");
+ funcOp.emitOpError("Unsupported pipeline on CPU target.");
return signalPassFailure();
}
- if (failed(runPipeline(pipeline, variantOp))) {
+ if (failed(runPipeline(pipeline, funcOp))) {
return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMCPULowerExecutableTargetPass() {
return std::make_unique<LLVMCPULowerExecutableTargetPass>();
}
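Because the pass above is now an `InterfacePass`, the executable-level pipeline has to nest it under the inner module so it runs once per function. A hedged sketch of that wiring; the real plumbing lives in the LLVMCPU `Passes.cpp` and may use different helpers:

```cpp
#include "iree/compiler/Codegen/LLVMCPU/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

// Illustrative wiring only; the surrounding pipeline shape is assumed.
static void buildExampleCPUCodegenPipeline(mlir::OpPassManager &variantPM) {
  using namespace mlir::iree_compiler;
  // Strategy selection stays module-scoped because transform scripts are
  // injected at module level.
  mlir::OpPassManager &modulePM = variantPM.nest<mlir::ModuleOp>();
  modulePM.addPass(createLLVMCPUSelectLoweringStrategyPass());
  // The actual lowering now runs once per function-like op in the module.
  modulePM.addNestedPass<mlir::func::FuncOp>(
      createLLVMCPULowerExecutableTargetPass());
}
```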
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp
index 11c71fb..89733f6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUSelectLoweringStrategy.cpp
@@ -61,13 +61,13 @@
};
} // namespace
-/// Verify that valid configuration is set for all ops within the module.
+/// Verify that valid configuration is set for all ops within the funcOp.
template <typename F>
static LogicalResult
-verifyLoweringConfiguration(ModuleOp module,
+verifyLoweringConfiguration(FunctionOpInterface funcOp,
IREE::Codegen::TranslationInfoAttr translationInfo,
F verificationFn) {
- auto walkResult = module.walk([&](Operation *op) -> WalkResult {
+ auto walkResult = funcOp.walk([&](Operation *op) -> WalkResult {
IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
if (!loweringConfig)
return WalkResult::advance();
@@ -79,46 +79,41 @@
}
void LLVMCPUSelectLoweringStrategyPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- ModuleOp moduleOp = variantOp.getInnerModule();
+ auto moduleOp = getOperation();
+ for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
+ // Set the strategy with default heuristics.
+ if (failed(initCPULaunchConfig(funcOp))) {
+ funcOp.emitOpError("failed to set lowering configuration");
+ return signalPassFailure();
+ }
- // Set the strategy with default heuristics.
- if (failed(initCPULaunchConfig(moduleOp))) {
- return signalPassFailure();
- }
+ auto translationInfo = getTranslationInfo(funcOp);
+ if (!translationInfo) {
+ continue;
+ }
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- moduleOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
- return signalPassFailure();
- }
-
- // Verify the configuration.
- LogicalResult verificationStatus = success();
- switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
- case IREE::Codegen::DispatchLoweringPassPipeline::CPUDoubleTilingExpert:
- verificationStatus =
- verifyLoweringConfiguration(moduleOp, translationInfo.value(),
- verifyDoubleTilingExpertPassPipelineConfig);
- break;
- case IREE::Codegen::DispatchLoweringPassPipeline::
- CPUConvTileAndDecomposeExpert:
- verificationStatus =
- verifyLoweringConfiguration(moduleOp, translationInfo.value(),
- verifyConvTileAndDecomposeExpertConfig);
- break;
- default:
- break;
- }
- if (failed(verificationStatus)) {
- return signalPassFailure();
+ // Verify the configuration.
+ LogicalResult verificationStatus = success();
+ switch (translationInfo.getDispatchLoweringPassPipeline()) {
+ case IREE::Codegen::DispatchLoweringPassPipeline::CPUDoubleTilingExpert:
+ verificationStatus = verifyLoweringConfiguration(
+ funcOp, translationInfo, verifyDoubleTilingExpertPassPipelineConfig);
+ break;
+ case IREE::Codegen::DispatchLoweringPassPipeline::
+ CPUConvTileAndDecomposeExpert:
+ verificationStatus = verifyLoweringConfiguration(
+ funcOp, translationInfo, verifyConvTileAndDecomposeExpertConfig);
+ break;
+ default:
+ break;
+ }
+ if (failed(verificationStatus)) {
+ return signalPassFailure();
+ }
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createLLVMCPUSelectLoweringStrategyPass() {
return std::make_unique<LLVMCPUSelectLoweringStrategyPass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 4aeffdb..ef9fbd6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -4,12 +4,14 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h"
#include "iree-dialects/Dialect/LinalgTransform/Passes.h"
#include "iree/compiler/Codegen/Common/CPU/Passes.h"
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/Common/TileSizeSelection.h"
#include "iree/compiler/Codegen/LLVMCPU/Passes.h"
+#include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h"
+#include "iree/compiler/Utils/PassUtils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
@@ -28,7 +30,7 @@
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
-#define DEBUG_TYPE "iree-llvm-cpu-lowering-pass-pipeline"
+#define DEBUG_TYPE "iree-llvmcpu-pass-pipelines"
namespace mlir::iree_compiler {
@@ -93,20 +95,15 @@
"LLVMCPUMmt4dVectorLowering pass."),
llvm::cl::init(true));
-static void addTileAndDistributePasses(OpPassManager &pm) {
- pm.addPass(createTileAndDistributeToWorkgroupsPass());
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFoldAffineMinInDistributedLoopsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFuseTensorPadWithConsumerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConcretizePadResultShapePass());
- nestedModulePM.addNestedPass<func::FuncOp>(
+static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass());
+ funcPassManager.addPass(createConvertToDestinationPassingStylePass());
+ funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
+ funcPassManager.addPass(createConcretizePadResultShapePass());
+ funcPassManager.addPass(
IREE::LinalgExt::createTileAndDecomposeWinogradTransformPass());
}
@@ -290,83 +287,71 @@
//===---------------------------------------------------------------------===//
void buildLLVMCPUVectorLoweringPipeline(
- OpPassManager &passManager,
+ OpPassManager &funcPassManager,
const LLVMCPUVectorLoweringPassOptions &options) {
- passManager.addNestedPass<func::FuncOp>(
- createLLVMCPUDropVectorUnitDimsPass());
- passManager.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(createLLVMCPUDropVectorUnitDimsPass());
+ funcPassManager.addPass(
createLLVMCPUVirtualVectorLoweringPass(options.splitVectorTransfersTo));
  // Make sure we remove redundant vector ops (e.g., vector transposes) before
  // we lower them, after which they can no longer be optimized away.
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+ funcPassManager.addPass(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(
- createLLVMCPUVectorTransferLoweringPass());
- passManager.addNestedPass<func::FuncOp>(
- createLLVMCPUVectorTransposeLoweringPass(
- options.lowerVectorTransposeToAVX2));
+ funcPassManager.addPass(createLLVMCPUVectorTransferLoweringPass());
+ funcPassManager.addPass(createLLVMCPUVectorTransposeLoweringPass(
+ options.lowerVectorTransposeToAVX2));
// Potentially removes shape_cast and broadcast on unit dims before shape_cast
// lowering.
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+ funcPassManager.addPass(createCanonicalizerPass());
// 'vector.shape_cast' are very expensive operations that are even generated
// by some of the lowerings above (e.g., transpose lowering). There are
// chances to cancel them out if they are not lowered too early so we lower
// them at the very end of the pass.
- passManager.addNestedPass<func::FuncOp>(
- createLLVMCPUVectorShapeCastLoweringPass());
+ funcPassManager.addPass(createLLVMCPUVectorShapeCastLoweringPass());
}
void addCPUBufferOpsTileAndVectorizePipeline(
- OpPassManager &passManager, TilingConfig &tilingConfig,
+ OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(passManager);
+ addTileAndDistributePasses(funcPassManager);
// Skip tiling reduction loops because this is expected to apply on copy ops
// only.
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUPeelPass());
+ funcPassManager.addPass(createLLVMCPUPeelPass());
{
GenericVectorizationPassOptions options;
options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes;
options.enableVectorMasking = pipelineOpt.enableVectorMasking;
options.vectorizeGatherAccesses = true;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
// Run IREE specific passes before vector lowering expert.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
{
LLVMCPUVectorLoweringPassOptions options;
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
- buildLLVMCPUVectorLoweringPipeline(nestedModulePM, options);
+ buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
- if (pipelineOpt.enableAArch64SSVE) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- mlir::arm_sme::createEnableArmStreamingPass(
- mlir::arm_sme::ArmStreamingMode::StreamingLocally));
- }
+ if (pipelineOpt.enableAArch64SSVE)
+ funcPassManager.addPass(mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreamingMode::StreamingLocally));
}
-void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
+void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(passManager);
-
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
+ addTileAndDistributePasses(funcPassManager);
SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
// Apply tile and fuse to all the non-distribution fusable levels. Skip
@@ -378,157 +363,132 @@
if (i == tilingConfig.getDistributionLevel())
continue;
if (fusableLevels.contains(i)) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMCPUTileAndFusePass(i));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFuseTensorPadWithConsumerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConcretizePadResultShapePass());
+ funcPassManager.addPass(createLLVMCPUTileAndFusePass(i));
+ funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
+ funcPassManager.addPass(createConcretizePadResultShapePass());
continue;
}
if (i == tilingConfig.getVectorReductionLevel()) {
// Run SplitReductionPass before the final reduction Fuse pass, because
// SplitReductionPass takes care of banked-tiling.
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTilePass(i));
+ funcPassManager.addPass(createLLVMCPUTilePass(i));
continue;
}
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTilePass(i));
+ funcPassManager.addPass(createLLVMCPUTilePass(i));
}
}
if (pipelineOpt.enablePeeling) {
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUPeelPass());
+ funcPassManager.addPass(createLLVMCPUPeelPass());
}
if (pipelineOpt.enableAArch64SSVE) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMCPU2DScalableTo1DScalablePass());
+ funcPassManager.addPass(createLLVMCPU2DScalableTo1DScalablePass());
}
{
- nestedModulePM.addNestedPass<func::FuncOp>(createVectorizePadPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createDecomposePackUnPackOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createVectorizePadPass());
+ funcPassManager.addPass(createDecomposePackUnPackOpsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
GenericVectorizationPassOptions options;
options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes;
options.enableVectorMasking = pipelineOpt.enableVectorMasking;
options.vectorizePadding = true;
options.vectorizeGatherAccesses = true;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
- addCPUBufferizePasses(nestedModulePM);
+ addCPUBufferizePasses(funcPassManager);
// Run IREE specific passes before vector lowering expert.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
{
LLVMCPUVectorLoweringPassOptions options;
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
- buildLLVMCPUVectorLoweringPipeline(nestedModulePM, options);
+ buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
- if (pipelineOpt.enableAArch64SSVE) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- mlir::arm_sme::createEnableArmStreamingPass(
- mlir::arm_sme::ArmStreamingMode::StreamingLocally));
- }
+ if (pipelineOpt.enableAArch64SSVE)
+ funcPassManager.addPass(mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreamingMode::StreamingLocally));
}
void addConvTileAndDecomposeExpertPassPipeline(
- OpPassManager &passManager, TilingConfig &tilingConfig,
+ OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(passManager);
+ addTileAndDistributePasses(funcPassManager);
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
  // Run LLVMTileAndFuse first in case we have fill + conv + generic ops. At
  // this stage, we do not apply vectorization. The reduction dim won't get
  // tiled in the conv + generic case, so we have to tile along the reduction
  // dim again, which requires the ops to still be in Linalg form.
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTileAndFusePass(
+ funcPassManager.addPass(createLLVMCPUTileAndFusePass(
tilingConfig.getVectorCommonParallelLevel()));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFuseTensorPadWithConsumerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConcretizePadResultShapePass());
+ funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
+ funcPassManager.addPass(createConcretizePadResultShapePass());
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMCPUTilePass(tilingConfig.getVectorReductionLevel()));
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMCPUTileAndFusePass(tilingConfig.getVectorInnerParallelLevel()));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createDecomposeConvolutionToLowerDimOpsPass());
+ funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFuseTensorPadWithConsumerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConcretizePadResultShapePass());
+ funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
+ funcPassManager.addPass(createConcretizePadResultShapePass());
{
- nestedModulePM.addNestedPass<func::FuncOp>(createVectorizePadPass());
+ funcPassManager.addPass(createVectorizePadPass());
GenericVectorizationPassOptions options;
options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes;
options.enableVectorMasking = pipelineOpt.enableVectorMasking;
options.vectorizePadding = true;
options.vectorizeGatherAccesses = true;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
// Eliminate redundant transfer_read/write to avoid stack allocations.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeVectorTransferPass(/*flatten=*/true));
+ funcPassManager.addPass(createOptimizeVectorTransferPass(/*flatten=*/true));
- addCPUBufferizePasses(nestedModulePM);
+ addCPUBufferizePasses(funcPassManager);
// Run IREE specific passes before vector lowering expert.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
{
LLVMCPUVectorLoweringPassOptions options;
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "shuffle";
- buildLLVMCPUVectorLoweringPipeline(nestedModulePM, options);
+ buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
- if (pipelineOpt.enableAArch64SSVE) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- mlir::arm_sme::createEnableArmStreamingPass(
- mlir::arm_sme::ArmStreamingMode::StreamingLocally));
- }
+ if (pipelineOpt.enableAArch64SSVE)
+ funcPassManager.addPass(mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreamingMode::StreamingLocally));
}
-void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
+void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(passManager);
-
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
+ addTileAndDistributePasses(funcPassManager);
if (pipelineOpt.enableUkernels) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createDecomposeBatchMmt4DOpsPass());
- nestedModulePM.addPass(
+ funcPassManager.addPass(createDecomposeBatchMmt4DOpsPass());
+ funcPassManager.addPass(
createCPULowerToUKernelsPass(clSkipIntermediateRoundings));
}
@@ -547,21 +507,20 @@
if (i == tilingConfig.getDistributionLevel())
continue;
if (fusableLevels.contains(i)) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMCPUTileAndFusePass(i));
+ funcPassManager.addPass(createLLVMCPUTileAndFusePass(i));
continue;
}
if (i == tilingConfig.getVectorReductionLevel()) {
// Run SplitReductionPass before the final reduction Fuse pass, because
// SplitReductionPass takes care of banked-tiling.
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTilePass(i));
+ funcPassManager.addPass(createLLVMCPUTilePass(i));
continue;
}
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMCPUTilePass(i));
+ funcPassManager.addPass(createLLVMCPUTilePass(i));
}
}
@@ -570,271 +529,249 @@
options.enableVectorMasking = pipelineOpt.enableVectorMasking;
options.vectorizePadding = true;
options.vectorizeGatherAccesses = true;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- addCPUBufferizePasses(nestedModulePM);
+ addCPUBufferizePasses(funcPassManager);
// Vector lowering of Mmt4d.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMCPUMmt4dVectorLoweringPass(
- clEnableVectorContractCustomKernels));
+ funcPassManager.addPass(createLLVMCPUMmt4dVectorLoweringPass(
+ clEnableVectorContractCustomKernels));
// Generic vector lowering.
LLVMCPUVectorLoweringPassOptions options;
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
- buildLLVMCPUVectorLoweringPipeline(nestedModulePM, options);
+ buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
-void addCPUDataTilingPipeline(OpPassManager &passManager,
+void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(passManager);
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
+ addTileAndDistributePasses(funcPassManager);
+ funcPassManager.addPass(
createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createDecomposePackUnPackOpsPass());
+ funcPassManager.addPass(createDecomposePackUnPackOpsPass());
{
GenericVectorizationPassOptions options;
options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes;
options.vectorizePadding = true;
options.enableVectorMasking = pipelineOpt.enableVectorMasking;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
- addCPUBufferizePasses(nestedModulePM);
+ addCPUBufferizePasses(funcPassManager);
{
LLVMCPUVectorLoweringPassOptions options;
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
- buildLLVMCPUVectorLoweringPipeline(nestedModulePM, options);
+ buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
}
void addCPULinalgExtTileAndVectorizePipeline(
- OpPassManager &passManager, TilingConfig &tilingConfig,
+ OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt) {
- addTileAndDistributePasses(passManager);
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
+ addTileAndDistributePasses(funcPassManager);
+ funcPassManager.addPass(
createLLVMCPUTilePass(tilingConfig.getVectorCommonParallelLevel()));
// TODO: Should only apply decomposition here?
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
IREE::LinalgExt::createTileAndDecomposeAttentionPass());
{
GenericVectorizationPassOptions options;
options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes;
options.enableVectorMasking = pipelineOpt.enableVectorMasking;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
- addCPUBufferizePasses(nestedModulePM);
+ addCPUBufferizePasses(funcPassManager);
{
LLVMCPUVectorLoweringPassOptions options;
options.lowerVectorTransposeToAVX2 = pipelineOpt.lowerToAVX2;
options.splitVectorTransfersTo = "linalg-copy";
- buildLLVMCPUVectorLoweringPipeline(nestedModulePM, options);
+ buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
}
}
-void addCPUDefaultPassPipeline(OpPassManager &passManager) {
- addTileAndDistributePasses(passManager);
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
- addCPUBufferizePasses(nestedModulePM);
+void addCPUDefaultPassPipeline(OpPassManager &funcPassManager) {
+ addTileAndDistributePasses(funcPassManager);
+ addCPUBufferizePasses(funcPassManager);
}
-void addTransformDialectPasses(OpPassManager &passManager,
- StringRef entryPoint) {
- // Give control to the transform dialect.
- passManager.addPass(
- mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint));
- // Dropping the schedule is needed:
- // 1. if we want to embed the transform in the module: we should drop the
- // schedule once applied.
- // 2. if transform.do_not_dce_operands ops are introduced.
- passManager.addPass(createDropSchedulePass());
-}
-
-static void addLowerToLLVMPasses(OpPassManager &passManager,
+static void addLowerToLLVMPasses(OpPassManager &modulePassManager,
bool enableAArch64SME) {
// TODO: Remove the following pass and plumb support for #hal.descriptor_type
// memory space through the stack.
- passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createEraseHALDescriptorTypeFromMemRefPass);
// Lower `ukernel.*` ops to function calls
- passManager.addPass(createLowerUKernelOpsToCallsPass());
+ modulePassManager.addPass(createLowerUKernelOpsToCallsPass());
- // LinalgExt -> SCF
- passManager.addNestedPass<func::FuncOp>(
- IREE::LinalgExt::createLinalgExtToLoopsPass());
-
- // Linalg -> SCF
- passManager.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- if (clCheckLinalgVectorization) {
- passManager.addNestedPass<func::FuncOp>(
- createLLVMCPUEmitVectorizationRemarksPass());
- }
- passManager.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
- passManager.addPass(createConvertBf16ArithToF32Pass());
- passManager.addPass(createConvertBf16ToUInt16BuffersPass());
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCSEPass());
+ FunctionLikeNest(modulePassManager)
+ // LinalgExt -> SCF
+ .addPass(IREE::LinalgExt::createLinalgExtToLoopsPass)
+ // Linalg -> SCF
+ .addPass(createMemrefCopyToLinalgPass)
+ .addPredicatedPass(clCheckLinalgVectorization,
+ createLLVMCPUEmitVectorizationRemarksPass)
+ .addPass(createConvertLinalgToLoopsPass)
+ .addPass(createConvertBf16ArithToF32Pass)
+ .addPass(createConvertBf16ToUInt16BuffersPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass);
// Handled tensor-type constants.
- passManager.addPass(arith::createConstantBufferizePass());
- passManager.addPass(createFoldTensorExtractOpPass());
+ modulePassManager.addPass(arith::createConstantBufferizePass());
- // Handle complex operation conversion.
- passManager.addPass(createConvertComplexToStandardPass());
-
- // math dialect elementry functions -> polynomial form.
- passManager.addNestedPass<func::FuncOp>(createPolynomialApproximationPass());
-
- passManager.addNestedPass<func::FuncOp>(
- createHoistStaticallyBoundAllocationsPass());
-
- // Use `arith.minf/maxf` instead of `arith.minimumf/maximumf`.
- if (clUseFastMinMaxOps) {
- passManager.addNestedPass<func::FuncOp>(createReplaceSlowMinMaxOpsPass());
- }
+ FunctionLikeNest(modulePassManager)
+ .addPass(createFoldTensorExtractOpPass)
+ // Handle complex operation conversion.
+ .addPass(createConvertComplexToStandardPass)
+      // math dialect elementary functions -> polynomial form.
+ .addPass(createPolynomialApproximationPass)
+ .addPass(createHoistStaticallyBoundAllocationsPass)
+ // Use `arith.minf/maxf` instead of `arith.minimumf/maximumf`.
+ .addPredicatedPass(clUseFastMinMaxOps, createReplaceSlowMinMaxOpsPass);
if (enableAArch64SME) {
- // Decompose large (2D-scalable) vector types to (multiple) SME tiles
- // + some ArmSME specific vector dialect rewrites.
- passManager.addPass(mlir::arm_sme::createVectorLegalizationPass());
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCSEPass());
-
- // (Arith, Vector) -> ArmSME
- passManager.addNestedPass<func::FuncOp>(
- mlir::createArithToArmSMEConversionPass());
- passManager.addNestedPass<func::FuncOp>(
- mlir::createConvertVectorToArmSMEPass());
- passManager.addNestedPass<func::FuncOp>(
- mlir::arm_sme::createTileAllocationPass());
- passManager.addNestedPass<func::FuncOp>(
- mlir::arm_sme::createEnableArmStreamingPass(
- mlir::arm_sme::ArmStreamingMode::StreamingLocally,
- mlir::arm_sme::ArmZaMode::NewZA,
- /*onlyIfRequiredByOps=*/true));
- passManager.addNestedPass<func::FuncOp>(
- mlir::createConvertArmSMEToSCFPass());
+ modulePassManager.addPass(mlir::arm_sme::createVectorLegalizationPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ .addPass(mlir::createArithToArmSMEConversionPass)
+ .addPass(mlir::createConvertVectorToArmSMEPass)
+ .addPass(mlir::arm_sme::createTileAllocationPass)
+ .addPass([]() {
+ return mlir::arm_sme::createEnableArmStreamingPass(
+ mlir::arm_sme::ArmStreamingMode::StreamingLocally,
+ mlir::arm_sme::ArmZaMode::NewZA,
+ /*onlyIfRequiredByOps=*/true);
+ })
+ .addPass(mlir::createConvertArmSMEToSCFPass);
}
- // Resolve get_buffer_descriptor ops. All structural buffer manipulations
- // must conclude before this point.
- passManager.addNestedPass<func::FuncOp>(
- createIREEExpandStridedMetadataPass());
- passManager.addNestedPass<func::FuncOp>(createCleanupBufferAllocViewPass());
+ FunctionLikeNest(modulePassManager)
+ // Resolve get_buffer_descriptor ops. All structural buffer manipulations
+ // must conclude before this point.
+ .addPass(createIREEExpandStridedMetadataPass)
+ .addPass(createCleanupBufferAllocViewPass)
+ // Checking stack allocation before converting to CF dialect is easier.
+ .addPass([&]() {
+ return createLLVMCPUCheckIRBeforeLLVMConversionPass(
+ clFailOnOutOfBoundsStackAllocation);
+ })
+ // SCF -> CF
+ .addPass(createConvertSCFToCFPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ // (HAL, IREE, Linalg, CF) -> LLVM
+ .addPass(arith::createArithExpandOpsPass)
+ .addPass(memref::createExpandOpsPass)
+ .addPass(memref::createFoldMemRefAliasOpsPass)
+ .addPass(createEmulateNarrowTypePass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ .addPredicatedPass(clInstrumentMemoryAccesses,
+ createInstrumentMemoryAccessesPass);
- // Checking stack allocation before converting to CF dialect is easier.
- passManager.addPass(createLLVMCPUCheckIRBeforeLLVMConversionPass(
- clFailOnOutOfBoundsStackAllocation));
-
- // SCF -> CF
- passManager.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCSEPass());
-
- // (HAL, IREE, Linalg, CF) -> LLVM
- passManager.addNestedPass<func::FuncOp>(arith::createArithExpandOpsPass());
- passManager.addNestedPass<func::FuncOp>(memref::createExpandOpsPass());
- passManager.addPass(memref::createFoldMemRefAliasOpsPass());
- passManager.addPass(createEmulateNarrowTypePass());
- passManager.addPass(createCanonicalizerPass());
- passManager.addPass(createCSEPass());
- if (clInstrumentMemoryAccesses) {
- passManager.addNestedPass<func::FuncOp>(
- createInstrumentMemoryAccessesPass());
- }
if (enableAArch64SME) {
- passManager.addPass(createConvertArmSMEToLLVMPass());
+ modulePassManager.addPass(createConvertArmSMEToLLVMPass());
}
- passManager.addPass(createConvertToLLVMPass(clEnableReassociateFpReductions));
- passManager.addPass(createReconcileUnrealizedCastsPass());
+ modulePassManager.addPass(
+ createConvertToLLVMPass(clEnableReassociateFpReductions));
+ modulePassManager.addPass(createReconcileUnrealizedCastsPass());
// We rely on MLIR symbol visibility being correct after this point and need
// to mirror the LLVM linkage that was assigned during conversion.
- passManager.addPass(createLLVMCPUSynchronizeSymbolVisibilityPass());
+ modulePassManager.addPass(createLLVMCPUSynchronizeSymbolVisibilityPass());
- passManager.addPass(createCanonicalizerPass());
- passManager.addPass(createCSEPass());
- passManager.addNestedPass<LLVM::LLVMFuncOp>(createAddFastMathFlagsPass());
+ modulePassManager.addPass(createCanonicalizerPass());
+ modulePassManager.addPass(createCSEPass());
+ modulePassManager.addNestedPass<LLVM::LLVMFuncOp>(
+ createAddFastMathFlagsPass());
}
-void buildLLVMCPUCodegenConfigurationPassPipeline(OpPassManager &passManager) {
+void buildLLVMCPUCodegenConfigurationPassPipelineImpl(
+ OpPassManager &modulePassManager) {
{
- addCommonTargetExecutablePreprocessingPasses(passManager,
+ FunctionLikeNest funcPassManager(modulePassManager);
+ addCommonTargetExecutablePreprocessingPasses(funcPassManager,
clUseSoftmaxInterFusion);
- OpPassManager &modulePassManager = passManager.nest<ModuleOp>();
- modulePassManager.addNestedPass<func::FuncOp>(
- createRematerializeParallelOpsPass());
- // TODO(#13888): This(createExpandF16OpToF32Pass()) pass is being added way
- // to late and should insted be be done during lowering to LLVM.
- modulePassManager.addPass(createExpandF16OpToF32Pass());
-
- modulePassManager.addNestedPass<func::FuncOp>(
- createCPUMaterializeEncodingPass());
- // TODO: Remove the following pass the plumb support for
- // #hal.descriptor_type memory space through the stack.
- modulePassManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
}
+ modulePassManager.addPass(createMaterializeUserConfigsPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createRematerializeParallelOpsPass)
+      // TODO(#13888): This pass (createExpandF16OpToF32Pass) is being added
+      // way too late and should instead be done during lowering to LLVM.
+ .addPass(createExpandF16OpToF32Pass)
+ .addPass([&]() { return createCPUMaterializeEncodingPass(); })
+      // TODO: Remove the following pass and plumb support for
+ // #hal.descriptor_type memory space through the stack.
+ .addPass(createEraseHALDescriptorTypeFromMemRefPass);
- passManager.addPass(createLLVMCPUSelectLoweringStrategyPass());
+ modulePassManager.addPass(createLLVMCPUSelectLoweringStrategyPass());
+ LLVM_DEBUG({
+ llvm::dbgs() << "LLVMCPU codegen configuration pass pipeline:\n";
+ modulePassManager.printAsTextualPipeline(llvm::dbgs());
+ llvm::dbgs() << "\n";
+ });
}
-void buildLLVMCPUCodegenPassPipeline(OpPassManager &passManager,
- bool enableAArch64SME) {
- passManager.addPass(createLLVMCPULowerExecutableTargetPass());
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
- addLowerToLLVMPasses(nestedModulePM, enableAArch64SME);
+void buildLLVMCPUCodegenConfigurationPassPipeline(
+ OpPassManager &variantPassManager) {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ buildLLVMCPUCodegenConfigurationPassPipelineImpl(modulePassManager);
+}
+void buildLLVMCPUCodegenPassPipeline(OpPassManager &variantPassManager,
+ bool enableAArch64SME) {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createLLVMCPULowerExecutableTargetPass);
+
+ // Run conversion to LLVM at `ModuleOp` granularity.
+ addLowerToLLVMPasses(modulePassManager, enableAArch64SME);
LLVM_DEBUG({
- llvm::dbgs() << "Using LLVMCPU pass pipeline:\n";
- passManager.printAsTextualPipeline(llvm::dbgs());
+ llvm::dbgs() << "LLVMCPU codegen pass pipeline:\n";
+ variantPassManager.printAsTextualPipeline(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
// NOTE: this runs on the top-level program module containing all
// hal.executable ops.
-void buildLLVMCPULinkingPassPipeline(OpPassManager &passManager) {
+void buildLLVMCPULinkingPassPipeline(OpPassManager &modulePassManager) {
// Link together executables. This may produce some IR duplication.
- passManager.addPass(createLLVMCPULinkExecutablesPass());
+ modulePassManager.addPass(createLLVMCPULinkExecutablesPass());
// Cleanup IR duplication.
- passManager.addNestedPass<IREE::HAL::ExecutableOp>(
+ modulePassManager.addNestedPass<IREE::HAL::ExecutableOp>(
mlir::createCanonicalizerPass());
// Assign final executable constant and import ordinals.
- auto &variantPM = passManager.nest<IREE::HAL::ExecutableOp>()
- .nest<IREE::HAL::ExecutableVariantOp>();
- variantPM.addPass(createLLVMCPUAssignConstantOrdinalsPass());
- variantPM.addPass(createLLVMCPUAssignImportOrdinalsPass());
+ auto &variantPassManager = modulePassManager.nest<IREE::HAL::ExecutableOp>()
+ .nest<IREE::HAL::ExecutableVariantOp>();
+ variantPassManager.addPass(createLLVMCPUAssignConstantOrdinalsPass());
+ variantPassManager.addPass(createLLVMCPUAssignImportOrdinalsPass());
}
//===---------------------------------------------------------------------===//
@@ -853,36 +790,38 @@
static PassPipelineRegistration<> LLVMCPUConfigPipeline(
"iree-codegen-llvmcpu-configuration-pipeline",
"Runs the translation strategy configuration pipeline on Linalg for CPU",
- [](OpPassManager &passManager) {
- buildLLVMCPUCodegenConfigurationPassPipeline(passManager);
+ [](OpPassManager &modulePassManager) {
+ buildLLVMCPUCodegenConfigurationPassPipeline(modulePassManager);
});
static PassPipelineRegistration<> LLVMCPUBufferizationPipeline(
"iree-codegen-llvmcpu-bufferization-pipeline",
"Runs the bufferization pipeline for CPU",
- [](OpPassManager &passManager) { addCPUBufferizePasses(passManager); });
+ [](OpPassManager &funcPassManager) {
+ addCPUBufferizePasses(funcPassManager);
+ });
static PassPipelineRegistration<> LLVMCPUVectorLoweringPipeline(
"iree-codegen-llvmcpu-vector-lowering-pipeline",
"Runs the translation strategy configuration pipeline on Linalg for CPU",
- [](OpPassManager &passManager) {
+ [](OpPassManager &funcPassManager) {
LLVMCPUVectorLoweringPassOptions options;
options.splitVectorTransfersTo = "linalg-copy";
- buildLLVMCPUVectorLoweringPipeline(passManager, options);
+ buildLLVMCPUVectorLoweringPipeline(funcPassManager, options);
});
static PassPipelineRegistration<> LinalgLLVMPipeline(
"iree-codegen-linalg-to-llvm-pipeline",
"Runs the progressive lowering pipeline from Linalg to LLVM",
- [](OpPassManager &passManager) {
- buildLLVMCPUCodegenPassPipeline(passManager);
+ [](OpPassManager &variantPassManager) {
+ buildLLVMCPUCodegenPassPipeline(variantPassManager);
});
static PassPipelineRegistration<> LLVMCPULinkingPipeline(
"iree-codegen-llvmcpu-linking-pipeline",
"Runs the LLVMCPU HAL executable linking pipeline",
- [](OpPassManager &passManager) {
- buildLLVMCPULinkingPassPipeline(passManager);
+ [](OpPassManager &modulePassManager) {
+ buildLLVMCPULinkingPassPipeline(modulePassManager);
});
}
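
For readers skimming the hunks above: the `FunctionLikeNest` helper is what replaces the repeated `addNestedPass<func::FuncOp>(...)` calls. The following is a minimal sketch, not part of the diff, built only from pass creators and flags that already appear in this file and assuming its existing includes and usings.

```
// Minimal sketch (assumes the includes/usings already present in Passes.cpp).
// FunctionLikeNest wraps a module-scoped OpPassManager and nests every pass
// added through it on each function-like op. Pass *creators* (callables) are
// passed rather than constructed passes, and addPredicatedPass only adds the
// pass when the condition is true.
static void addExampleFunctionPasses(OpPassManager &modulePassManager) {
  FunctionLikeNest(modulePassManager)
      .addPass(createCanonicalizerPass)
      .addPass(createCSEPass)
      // Conditionally added, mirroring the clUseFastMinMaxOps usage above.
      .addPredicatedPass(clUseFastMinMaxOps, createReplaceSlowMinMaxOpsPass)
      // A lambda is used when the creator needs constructor arguments.
      .addPass([&]() {
        return createLLVMCPUCheckIRBeforeLLVMConversionPass(
            clFailOnOutOfBoundsStackAllocation);
      });
}
```

Before this change, each of these additions would have been spelled as `modulePassManager.addNestedPass<func::FuncOp>(...)` with a constructed pass, which is exactly the pattern the removed lines show.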
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
index 2efe83a..10acd2d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h
@@ -24,7 +24,7 @@
createConvertToLLVMPass(bool reassociateFpReordering = false);
/// Checks CPU backend specific IR constraints (like no stack allocations)
-std::unique_ptr<OperationPass<ModuleOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMCPUCheckIRBeforeLLVMConversionPass(bool failOnOutOfBounds = true);
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
@@ -33,13 +33,13 @@
/// Pass to select a lowering strategy for a hal.executable.variant operation.
/// The variant is annotated with the selected strategies, which are
/// subsequently ingested by LLVMCPULowerExecutableTargetPass.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createLLVMCPUSelectLoweringStrategyPass();
/// Pass to lower the module an hal.executable.variant operation to external
/// dialect. Currently this pass lowers to LLVM dialect, but could be
/// generalized to lower to any "final" dialect like SPIR-V/NVVM, etc.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMCPULowerExecutableTargetPass();
/// Pass to handel F16 bit operations, but converting f16 operands to F32.
@@ -116,7 +116,7 @@
// Verifies that only supported IR constructs are passed to the compiler (like
// no Linalg transform markers are set).
-std::unique_ptr<OperationPass<ModuleOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createVerifyLinalgTransformLegalityPass();
//------------------------------------------------------------------------------
@@ -148,44 +148,40 @@
/// pipeline is only used for dispatches that just copy data from input
/// interfaces to output interface.
void addCPUBufferOpsTileAndVectorizePipeline(
- OpPassManager &passManager, TilingConfig &tilingConfig,
+ OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt);
/// Populates the passes to lower ops through data tiling transformations.
-void addCPUDataTilingPipeline(OpPassManager &passManager,
+void addCPUDataTilingPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt);
void addCPULinalgExtTileAndVectorizePipeline(
- OpPassManager &passManager, TilingConfig &tilingConfig,
+ OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt);
/// Populates the passes to lower to scalars operations for linalg based
/// code-generation. This pipeline does not vectorize, but instead just
/// converts to memrefs
-void addCPUDefaultPassPipeline(OpPassManager &passManager);
+void addCPUDefaultPassPipeline(OpPassManager &funcPassManager);
void addConvTileAndDecomposeExpertPassPipeline(
- OpPassManager &passManager, TilingConfig &tilingConfig,
+ OpPassManager &funcPassManager, TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt);
/// Populates the passes needed to multi level tile, fuse and vectorize
/// lowering of linalg ops on tensors to vectors operations.
-void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
+void addMmt4dTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt);
-void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
+void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
TilingConfig &tilingConfig,
LLVMCPUPipelineOptions &pipelineOpt);
-void addTensorToVectorsPassPipeline(OpPassManager &passManager,
+void addTensorToVectorsPassPipeline(OpPassManager &funcPassManager,
bool lowerToVectors = true);
-/// Transform dialect-based common.
-void addTransformDialectPasses(OpPassManager &passManager,
- StringRef entryPoint);
-
// Populates the passes needed to do tiling, decomposing, and vectorizing the
// convolution ops.
LogicalResult verifyConvTileAndDecomposeExpertConfig(
@@ -212,12 +208,13 @@
/// Populates passes needed for preprocessing before codegen lowerings, as well
/// as high level lowering strategy selection.
-void buildLLVMCPUCodegenConfigurationPassPipeline(OpPassManager &passManager);
+void buildLLVMCPUCodegenConfigurationPassPipeline(
+ OpPassManager &variantPassManager);
/// Populates passes needed to lower a XLA HLO op to LLVM dialect via the
/// structured ops path. The pass manager `pm` in here should operate on the
/// module within the IREE::HAL::ExecutableOp.
-void buildLLVMCPUCodegenPassPipeline(OpPassManager &passManager,
+void buildLLVMCPUCodegenPassPipeline(OpPassManager &variantPassManager,
bool enableAArch64SME = false);
//----------------------------------------------------------------------------//
@@ -237,7 +234,7 @@
createLLVMCPULinkExecutablesPass();
/// Populates passes needed to link HAL executables across LLVMCPU targets.
-void buildLLVMCPULinkingPassPipeline(OpPassManager &passManager);
+void buildLLVMCPULinkingPassPipeline(OpPassManager &modulePassManager);
//----------------------------------------------------------------------------//
// Register LLVMCPU Passes
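
To make the signature changes in this header concrete, here is a hedged sketch of how the pieces now compose. It paraphrases `buildLLVMCPUCodegenConfigurationPassPipeline` and `buildLLVMCPUCodegenPassPipeline` from the Passes.cpp hunks above; the function name and the exact pass ordering are illustrative, not the production pipeline.

```
// Sketch only: variant -> module -> function nesting. Module-scoped work
// (transform-dialect lowering, strategy selection) stays on ModuleOp, while
// per-function lowering nests on FunctionOpInterface via FunctionLikeNest.
void buildExampleCodegenPipeline(OpPassManager &variantPassManager) {
  OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
  modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());
  modulePassManager.addPass(createLLVMCPUSelectLoweringStrategyPass());
  FunctionLikeNest(modulePassManager)
      .addPass(createLLVMCPULowerExecutableTargetPass);
}
```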
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
index 7c77d56..02c455d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
@@ -46,7 +46,7 @@
}
def LLVMCPUCheckIRBeforeLLVMConversion :
- Pass<"iree-llvmcpu-check-ir-before-llvm-conversion", "ModuleOp"> {
+ InterfacePass<"iree-llvmcpu-check-ir-before-llvm-conversion", "mlir::FunctionOpInterface"> {
let summary = "Checks CPU backend specific IR constraints (like no allocas)";
let constructor = "mlir::iree_compiler::createLLVMCPUCheckIRBeforeLLVMConversionPass()";
let options = [
@@ -70,8 +70,7 @@
}
def LLVMCPULowerExecutableTarget :
- Pass<"iree-llvmcpu-lower-executable-target",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ InterfacePass<"iree-llvmcpu-lower-executable-target", "mlir::FunctionOpInterface"> {
let summary =
"Lower executable target using an IREE::HAL::DispatchLoweringPassPipeline";
let constructor =
@@ -98,8 +97,7 @@
}
def LLVMCPUSelectLoweringStrategy :
- Pass<"iree-llvmcpu-select-lowering-strategy",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ Pass<"iree-llvmcpu-select-lowering-strategy", "ModuleOp"> {
let summary =
"Select a IREE::HAL::DispatchLoweringPassPipeline for lowering the variant";
let constructor =
@@ -220,7 +218,7 @@
}
def VerifyLinalgTransformLegality :
- Pass<"iree-llvmcpu-verify-linalg-transform-legality", "ModuleOp"> {
+ InterfacePass<"iree-llvmcpu-verify-linalg-transform-legality", "mlir::FunctionOpInterface"> {
let summary = "Verify that only supported IR constructs are passed to the compiler.";
let constructor = "mlir::iree_compiler::createVerifyLinalgTransformLegalityPass()";
}
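
A practical consequence of switching these definitions from `Pass<..., "ModuleOp">` to `InterfacePass<..., "mlir::FunctionOpInterface">` is that the passes must now be anchored under a function-like op, both in C++ and in textual pipelines (see the updated RUN lines in the tests below). A minimal C++ sketch, assuming the declarations from Passes.h; the wrapper name is made up for illustration:

```
// Sketch: anchor the function-scoped check pass under func.func when building
// a module-level pipeline. This is the programmatic equivalent of
// --pass-pipeline='builtin.module(func.func(iree-llvmcpu-check-ir-before-llvm-conversion))'.
void addExampleCheckPass(OpPassManager &modulePassManager) {
  modulePassManager.addNestedPass<func::FuncOp>(
      createLLVMCPUCheckIRBeforeLLVMConversionPass());
}
```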
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/VerifyLinalgTransformLegality.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/VerifyLinalgTransformLegality.cpp
index c09a958..183b789 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/VerifyLinalgTransformLegality.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/VerifyLinalgTransformLegality.cpp
@@ -21,9 +21,9 @@
} // namespace
void VerifyLinalgTransformLegalityPass::runOnOperation() {
- auto moduleOp = getOperation();
+ auto funcOp = getOperation();
// For now only check that there are no Linalg transform markers.
- auto walkResult = moduleOp.walk([](linalg::LinalgOp op) -> WalkResult {
+ auto walkResult = funcOp.walk([](linalg::LinalgOp op) -> WalkResult {
if (op->hasAttr(LinalgTransforms::kLinalgTransformMarker)) {
return op.emitError("expected no Linalg transform markers");
}
@@ -34,7 +34,7 @@
}
}
-std::unique_ptr<OperationPass<ModuleOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createVerifyLinalgTransformLegalityPass() {
return std::make_unique<VerifyLinalgTransformLegalityPass>();
}
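
The VerifyLinalgTransformLegality change above is representative of how the other passes were ported. As a hedged illustration, the general shape is sketched below; `ExampleCheckPass`, `ExampleCheckPassBase`, and `createExampleCheckPass` are hypothetical names standing in for the real tablegen-generated ones.

```
// Illustrative pattern only (ExampleCheckPassBase is hypothetical): when a
// pass moves from ModuleOp to FunctionOpInterface, the body is unchanged and
// only the scope of getOperation() and the factory's return type change.
namespace {
struct ExampleCheckPass : ExampleCheckPassBase<ExampleCheckPass> {
  void runOnOperation() override {
    FunctionOpInterface funcOp = getOperation();
    funcOp.walk([&](linalg::LinalgOp op) {
      // Per-op verification happens here, exactly as it did on the module.
    });
  }
};
} // namespace

std::unique_ptr<InterfacePass<FunctionOpInterface>> createExampleCheckPass() {
  return std::make_unique<ExampleCheckPass>();
}
```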
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir
index 93d68db..8be21d5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir
@@ -1,22 +1,11 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering,iree-codegen-optimize-vector-transfer{flatten=true})))))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering,iree-codegen-optimize-vector-transfer{flatten=true}))' %s | FileCheck %s
-// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-
-hal.executable private @foo {
-hal.executable.variant @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {cpu_features = "+dotprod", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android29"}>) {
-hal.executable.export @foo layout(#pipeline_layout)
-builtin.module attributes {llvm.data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", llvm.target_triple = "aarch64-none-linux-android29"} {
-
-func.func @mmt4d_kernel_dispatch() {
+#target = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {
+ cpu_features = "+dotprod",
+ data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-none-linux-android29"}>
+func.func @mmt4d_kernel_dispatch() attributes {hal.executable.target = #target} {
%c0_i8 = arith.constant 0 : i8
%cst = arith.constant dense<0> : vector<1x1x8x8xi32>
%c2 = arith.constant 2 : index
@@ -60,10 +49,6 @@
return
}
-}
-}
-}
-
// CHECK-LABEL: @mmt4d_kernel_dispatch(
// CHECK: %[[LHS_FLAT32:.+]] = vector.transfer_read {{.*}} : memref<1x2x32xi8>, vector<32xi8>
// CHECK: %[[RHS_FLAT32:.+]] = vector.transfer_read {{.*}} : memref<1x2x32xi8>, vector<32xi8>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir
index 16d81db..110d626 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir
@@ -1,6 +1,6 @@
-// RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering),iree-codegen-llvmcpu-vector-lowering-pipeline)" --split-input-file | FileCheck %s
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering{vector-contract-custom-kernels=false})))))" --split-input-file | FileCheck %s -check-prefix=CHECK-KERNEL-OFF
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering{vector-contract-custom-kernels=true})))))" --split-input-file | FileCheck %s -check-prefix=CHECK-KERNEL-ON
+// RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering, iree-codegen-llvmcpu-vector-lowering-pipeline))" --split-input-file | FileCheck %s
+// RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering{vector-contract-custom-kernels=false}))" --split-input-file | FileCheck %s -check-prefix=CHECK-KERNEL-OFF
+// RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering{vector-contract-custom-kernels=true}))" --split-input-file | FileCheck %s -check-prefix=CHECK-KERNEL-ON
#map0 = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
@@ -179,43 +179,31 @@
// CHECK-KERNEL-OFF-LABEL: @simpul_mul_mixed_mini_no_custom_kernel
// CHECK-KERNEL-OFF-NOT: llvm.inline_asm asm_dialect
-hal.executable private @simpul_mul_mixed_mini_dispatch {
- hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+neon,+i8mm,+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : i64, target_triple = "aarch64-unknown-unknown-eabi-elf", ukernels = "none"}>) {
- hal.executable.export public @simpul_mul_mixed_mini_no_custom_kernel ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>} {
- ^bb0(%arg0: !hal.device):
- %c1 = arith.constant 1 : index
- hal.return %c1, %c1, %c1 : index, index, index
+#executable_target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+neon,+i8mm,+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : i64, target_triple = "aarch64-unknown-unknown-eabi-elf", ukernels = "none"}>
+#translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>
+module {
+ func.func @simpul_mul_mixed_mini_no_custom_kernel(%5 : vector<1x1x8x1xi8>, %6 : vector<1x1x8x1xi8> , %arg3 : vector<1x1x8x8xi32> ) -> vector<1x1x8x8xi32>
+ attributes { hal.executable.target = #executable_target, translation_info = #translation_info} {
+ %7 = arith.extsi %5 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
+ %8 = arith.extsi %6 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
+ %9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %arg3 : vector<1x1x8x1xi32>, vector<1x1x8x1xi32> into vector<1x1x8x8xi32>
+ return %9 : vector<1x1x8x8xi32>
}
- builtin.module {
- func.func @simpul_mul_mixed_mini_no_custom_kernel(%5 : vector<1x1x8x1xi8>, %6 : vector<1x1x8x1xi8> , %arg3 : vector<1x1x8x8xi32> ) -> vector<1x1x8x8xi32> {
- %7 = arith.extsi %5 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
- %8 = arith.extsi %6 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
- %9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %arg3 : vector<1x1x8x1xi32>, vector<1x1x8x1xi32> into vector<1x1x8x8xi32>
- return %9 : vector<1x1x8x8xi32>
- }
- }
- }
- }
+}
// -----
// CHECK-KERNEL-ON-LABEL: @simpul_mul_mixed_mini_custom_kernel
// CHECK-KERNEL-ON-DAG: llvm.inline_asm asm_dialect
-hal.executable private @simpul_mul_mixed_mini_dispatch {
- hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+neon,+i8mm,+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : i64, target_triple = "aarch64-unknown-unknown-eabi-elf", ukernels = "none"}>) {
- hal.executable.export public @simpul_mul_mixed_mini_custom_kernel ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>} {
- ^bb0(%arg0: !hal.device):
- %c1 = arith.constant 1 : index
- hal.return %c1, %c1, %c1 : index, index, index
+#executable_target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "+neon,+i8mm,+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : i64, target_triple = "aarch64-unknown-unknown-eabi-elf", ukernels = "none"}>
+#translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>
+module {
+ func.func @simpul_mul_mixed_mini_custom_kernel(%5 : vector<1x1x8x1xi8>, %6 : vector<1x1x8x1xi8> , %arg3 : vector<1x1x8x8xi32> ) -> vector<1x1x8x8xi32>
+ attributes { hal.executable.target = #executable_target, translation_info = #translation_info} {
+ %7 = arith.extsi %5 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
+ %8 = arith.extsi %6 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
+ %9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %arg3 : vector<1x1x8x1xi32>, vector<1x1x8x1xi32> into vector<1x1x8x8xi32>
+ return %9 : vector<1x1x8x8xi32>
}
- builtin.module {
- func.func @simpul_mul_mixed_mini_custom_kernel(%5 : vector<1x1x8x1xi8>, %6 : vector<1x1x8x1xi8> , %arg3 : vector<1x1x8x8xi32> ) -> vector<1x1x8x8xi32> {
- %7 = arith.extsi %5 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
- %8 = arith.extsi %6 : vector<1x1x8x1xi8> to vector<1x1x8x1xi32>
- %9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %8, %arg3 : vector<1x1x8x1xi32>, vector<1x1x8x1xi32> into vector<1x1x8x8xi32>
- return %9 : vector<1x1x8x8xi32>
- }
- }
- }
- }
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion.mlir
index 2b12c8a..8ac1d37 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-llvmcpu-check-ir-before-llvm-conversion %s --verify-diagnostics -split-input-file
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-check-ir-before-llvm-conversion))" %s --verify-diagnostics -split-input-file
module {
func.func @dynamic_allocas(%arg0: index) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir
index a740db3..f40cd36 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-llvmcpu-check-ir-before-llvm-conversion=fail-on-out-of-bounds=false %s --verify-diagnostics -split-input-file
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-check-ir-before-llvm-conversion{fail-on-out-of-bounds=false}))" %s --verify-diagnostics -split-input-file
module {
func.func @dynamic_allocas(%arg0: index) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
index 98bd3e1..d7cac72 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
@@ -1,204 +1,118 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' --verify-diagnostics --split-input-file %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --verify-diagnostics --split-input-file %s
#config = #iree_codegen.lowering_config<tile_sizes = []>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @llvm target(#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{expected four tiling levels, got 0}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{expected four tiling levels, got 0}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
-// -----
+// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[4, 8], [8, 8, 0], [0, 0, 8], [0, 0, 0]], native_vector_size = [0, 0, 4]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @llvm target(#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{native_vector_size must be empty}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{native_vector_size must be empty}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
-// -----
+// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [8, 32, 16], [0, 0, 16], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @llvm target(#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{expected only parallel dims to be set in the second tiling level, got 2-th tile size set}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{expected only parallel dims to be set in the second tiling level, got 2-th tile size set}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
-// -----
+// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [8, 0, 0], [0, 16, 16], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @llvm target(#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{only reduction dims to be set in the third tiling level, got 1-th tile size set}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{only reduction dims to be set in the third tiling level, got 1-th tile size set}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
-// -----
-#config = #iree_codegen.lowering_config<tile_sizes = [{sizes=[4, 8], interchange=[1]}, [8, 8, 0], [0, 0, 8], [0, 0, 0]]>
+// -----
+#config = #iree_codegen.lowering_config<tile_sizes = [{sizes = [4, 8], interchange = [1]}, [8, 8, 0], [0, 0, 8], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @llvm target(#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{expected [0, 2) to be set exactly once in interchange #0}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{expected [0, 2) to be set exactly once in interchange #0}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
-// -----
+// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 7, 7, 64, 0, 0, 0], [6, 1, 7, 32, 0, 0, 0], [0, 0, 0, 0, 3, 3, 4], [0, 0, 0, 0, 0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_2d_nhwc_hwcf {
- hal.executable.variant @llvm target(#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<36x9x9x512xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x3x512x512xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<36x7x7x512xf32>
- // expected-error @+1 {{can't decompose the conv op}}
- linalg.conv_2d_nhwc_hwcf {lowering_config = #config}
- ins(%lhs, %rhs : memref<36x9x9x512xf32>, memref<3x3x512x512xf32>)
- outs(%result: memref<36x7x7x512xf32>)
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<36x9x9x512xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x3x512x512xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<36x7x7x512xf32>
+ // expected-error @+1 {{can't decompose the conv op}}
+ linalg.conv_2d_nhwc_hwcf {lowering_config = #config} ins(%0, %1 : memref<36x9x9x512xf32>, memref<3x3x512x512xf32>) outs(%2 : memref<36x7x7x512xf32>)
+ return
}
}
-// -----
+// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 7, 64, 0, 0], [1, 1, 7, 8, 0, 0], [0, 0, 0, 0, 5, 5], [0, 0, 0, 0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @depthwise_conv_2d_nhwc_hwc {
- hal.executable.variant @llvm target(#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {}>) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
- // expected-error @+1 {{can't decompose the conv op}}
- linalg.depthwise_conv_2d_nhwc_hwc {lowering_config = #config, dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
- ins(%lhs, %rhs : memref<1x11x11x576xf32>, memref<5x5x576xf32>)
- outs(%result: memref<1x7x7x576xf32>)
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
+ // expected-error @+1 {{can't decompose the conv op}}
+ linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #config, strides = dense<1> : tensor<2xi64>} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : memref<1x7x7x576xf32>)
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir
index 02ec2cb..19552fa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir
@@ -1,40 +1,33 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
-// This file includes all the tests related to pack, unpack, and fusion tests.
-
-hal.executable private @aligned_generic_pack {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @aligned_generic_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @aligned_generic_pack() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 3.40282347E+38 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
- %5 = tensor.empty() : tensor<24x512x16x1xf32>
- %6 = tensor.empty() : tensor<384x512xf32>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<512xf32>, tensor<384x512xf32>) outs(%6 : tensor<384x512xf32>) {
- ^bb0(%in: f32, %in_1: f32, %out: f32):
- %8 = arith.addf %in, %in_1 : f32
- %9 = arith.minimumf %8, %cst : f32
- %10 = arith.maximumf %9, %cst_0 : f32
- linalg.yield %10 : f32
- } -> tensor<384x512xf32>
- %pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x512xf32> -> tensor<24x512x16x1xf32>
- flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [24, 512, 16, 1], strides = [1, 1, 1, 1] : tensor<24x512x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @aligned_generic_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+ %5 = tensor.empty() : tensor<24x512x16x1xf32>
+ %6 = tensor.empty() : tensor<384x512xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<512xf32>, tensor<384x512xf32>) outs(%6 : tensor<384x512xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %8 = arith.addf %in, %in_1 : f32
+ %9 = arith.minimumf %8, %cst : f32
+ %10 = arith.maximumf %9, %cst_0 : f32
+ linalg.yield %10 : f32
+ } -> tensor<384x512xf32>
+ %pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x512xf32> -> tensor<24x512x16x1xf32>
+ flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [24, 512, 16, 1], strides = [1, 1, 1, 1] : tensor<24x512x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
+ return
}
}
+
// CHECK-LABEL: func.func @aligned_generic_pack
// CHECK: %[[IN_0:.+]] = vector.broadcast %{{.+}} : vector<16xf32> to vector<16x16xf32>
// CHECK-COUNT-15: %{{.+}} = vector.insert {{.+}} : vector<16xf32> into vector<16x16xf32>
@@ -47,38 +40,33 @@
// -----
-hal.executable private @aligned_unpack_generic {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @aligned_unpack_generic ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @aligned_unpack_generic() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 3.40282347E+38 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<384x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [24, 32, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>> -> tensor<24x32x16x16xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
- %5 = tensor.empty() : tensor<384x512xf32>
- %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %5 : tensor<24x32x16x16xf32> -> tensor<384x512xf32>
- %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<512xf32>, tensor<384x512xf32>) outs(%5 : tensor<384x512xf32>) {
- ^bb0(%in: f32, %in_1: f32, %out: f32):
- %7 = arith.addf %in, %in_1 : f32
- %8 = arith.minimumf %7, %cst : f32
- %9 = arith.maximumf %8, %cst_0 : f32
- linalg.yield %9 : f32
- } -> tensor<384x512xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : tensor<384x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x512xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @aligned_unpack_generic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<384x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [24, 32, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>> -> tensor<24x32x16x16xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
+ %5 = tensor.empty() : tensor<384x512xf32>
+ %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %5 : tensor<24x32x16x16xf32> -> tensor<384x512xf32>
+ %6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<512xf32>, tensor<384x512xf32>) outs(%5 : tensor<384x512xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %7 = arith.addf %in, %in_1 : f32
+ %8 = arith.minimumf %7, %cst : f32
+ %9 = arith.maximumf %8, %cst_0 : f32
+ linalg.yield %9 : f32
+ } -> tensor<384x512xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : tensor<384x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x512xf32>>
+ return
}
}
+
// CHECK-LABEL: func.func @aligned_unpack_generic
// CHECK: %[[SRC:.+]] = hal.interface.binding.subspan {{.*}} : memref<24x32x16x16xf32, #hal.descriptor_type<storage_buffer>>
// CHECK: %[[SUBVIEW:.+]] = memref.subview %{{.*}} memref<24x32x16x16xf32, #hal.descriptor_type<storage_buffer>> to memref<
@@ -92,39 +80,21 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @unaligned_pack {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "+avx512f",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index,
- target_triple = "x86_64-none-elf"
- }>) {
- hal.executable.export public @unaligned_pack layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @unaligned_pack() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<20x40xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<20x40xf32>> -> tensor<20x40xf32>
- %3 = tensor.empty() : tensor<2x48x16x1xf32>
- %4 = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf32> -> tensor<2x48x16x1xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @unaligned_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<20x40xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<20x40xf32>> -> tensor<20x40xf32>
+ %3 = tensor.empty() : tensor<2x48x16x1xf32>
+ %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf32> -> tensor<2x48x16x1xf32>
+ flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
+ return
}
}
+
// CHECK-LABEL: func.func @unaligned_pack
// CHECK-COUNT-16: vector.maskedload {{.+}} vector<16xf32>
// CHECK-COUNT-64: vector.shuffle
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir
index bd29a98..c104dad 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir
@@ -1,58 +1,45 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --iree-llvmcpu-enable-pad-consumer-fusion --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --iree-llvmcpu-enable-pad-consumer-fusion --split-input-file %s | FileCheck %s
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 5, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
module {
- hal.executable private @pad_conv_2d {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3 ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3() {
- %cst = arith.constant 0.000000e+00 : f32
- %c1 = arith.constant 1 : index
- %c0 = arith.constant 0 : index
- %c5243520 = arith.constant 5243520 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [10486400 : index, 15729280 : index]} : i32 to index
- %6 = arith.index_castui %1 {stream.alignment = 256 : index, stream.values = [1273222400 : index, 1280618240 : index]} : i32 to index
- %7 = arith.index_castui %2 {stream.alignment = 256 : index, stream.values = [10507520 : index, 21488640 : index]} : i32 to index
- %8 = arith.index_castui %3 {stream.alignment = 256 : index, stream.values = [10508800 : index, 21489920 : index]} : i32 to index
- %9 = arith.index_castui %4 {stream.alignment = 128 : index, stream.values = [10486400 : index, 10487680 : index]} : i32 to index
- %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5243520) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320x64x64xf32>>
- %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<320x320x3x3xf32>>
- %12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320xf32>>
- %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320xf32>>
- %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320xf32>>
- %15 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
- %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320x64x64xf32>> -> tensor<1x320x64x64xf32>
- %17 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<320x320x3x3xf32>> -> tensor<320x320x3x3xf32>
- %18 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
- %19 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
- %20 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
- %21 = tensor.empty() : tensor<1x320x64x64xf32>
- %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
- %padded = tensor.pad %16 low[0, 0, 1, 1] high[0, 0, 1, 1] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst : f32
- } : tensor<1x320x64x64xf32> to tensor<1x320x66x66xf32>
- %23 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded, %17 : tensor<1x320x66x66xf32>, tensor<320x320x3x3xf32>) outs(%22 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
- flow.dispatch.tensor.store %23, %15, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<1x320x64x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
- return
- }
- }
- }
+ func.func @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c5243520 = arith.constant 5243520 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [10486400 : index, 15729280 : index]} : i32 to index
+ %6 = arith.index_castui %1 {stream.alignment = 256 : index, stream.values = [1273222400 : index, 1280618240 : index]} : i32 to index
+ %7 = arith.index_castui %2 {stream.alignment = 256 : index, stream.values = [10507520 : index, 21488640 : index]} : i32 to index
+ %8 = arith.index_castui %3 {stream.alignment = 256 : index, stream.values = [10508800 : index, 21489920 : index]} : i32 to index
+ %9 = arith.index_castui %4 {stream.alignment = 128 : index, stream.values = [10486400 : index, 10487680 : index]} : i32 to index
+ %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5243520) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320x64x64xf32>>
+ %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<320x320x3x3xf32>>
+ %12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320xf32>>
+ %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320xf32>>
+ %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x320xf32>>
+ %15 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
+ %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320x64x64xf32>> -> tensor<1x320x64x64xf32>
+ %17 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<320x320x3x3xf32>> -> tensor<320x320x3x3xf32>
+ %18 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
+ %19 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
+ %20 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x320xf32>> -> tensor<1x320xf32>
+ %21 = tensor.empty() : tensor<1x320x64x64xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
+ %padded = tensor.pad %16 low[0, 0, 1, 1] high[0, 0, 1, 1] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst : f32
+ } : tensor<1x320x64x64xf32> to tensor<1x320x66x66xf32>
+ %23 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded, %17 : tensor<1x320x66x66xf32>, tensor<320x320x3x3xf32>) outs(%22 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
+ flow.dispatch.tensor.store %23, %15, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<1x320x64x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x320x64x64xf32>>
+ return
}
}
+
// CHECK-LABEL: func.func @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3
//
// Check that the stack buffer is bounded by tiling sizes.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir
index db115e9..9e80837 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir
@@ -1,39 +1,23 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))" --split-input-file %s | FileCheck %s
-hal.executable private @pad_only {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu = "generic", cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @pad_only_dispatch ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @pad_only_dispatch() {
- %c634816 = arith.constant 634816 : index
- %c3846080 = arith.constant 3846080 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c634816) flags(ReadOnly)
- : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3846080)
- : !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>> -> tensor<1x112x112x64xf32>
- %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst : f32
- } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32>
- flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1]
- : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @pad_only_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c634816 = arith.constant 634816 : index
+ %c3846080 = arith.constant 3846080 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c634816) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3846080) : !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>> -> tensor<1x112x112x64xf32>
+ %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst : f32
+ } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32>
+ flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1] : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
+ return
}
}
+
// CHECK-LABEL: func @pad_only_dispatch()
// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x112x112x64xf32
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x114x114x64xf32
@@ -58,63 +42,41 @@
// CHECK: vector.store %[[RESULT_VEC]], %[[DROP_UNIT_OUTPUT_SLICE]]
// -----
-
-hal.executable private @pad_with_producer {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu = "generic", cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @pad_with_producer_dispatch ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @pad_with_producer_dispatch() {
- %c802816 = arith.constant 802816 : index
- %c72545728 = arith.constant 72545728 : index
- %c72676800 = arith.constant 72676800 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 1.001000e-05 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c802816) flags(ReadOnly)
- : !flow.dispatch.tensor<readonly:tensor<1x56x56x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72545728) flags(ReadOnly)
- : !flow.dispatch.tensor<readonly:tensor<1x1x256x128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72676800) flags(ReadOnly)
- : !flow.dispatch.tensor<readonly:tensor<128xf32>>
- %7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0)
- : !flow.dispatch.tensor<writeonly:tensor<1x30x30x128xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 56, 56, 256], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x56x56x256xf32>> -> tensor<1x56x56x256xf32>
- %9 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 256, 128], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x1x256x128xf32>> -> tensor<1x1x256x128xf32>
- %10 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
- %15 = tensor.empty() : tensor<1x28x28x128xf32>
- %16 = linalg.fill ins(%cst_0 : f32) outs(%15 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
- %17 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%8, %9 : tensor<1x56x56x256xf32>, tensor<1x1x256x128xf32>) outs(%16 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
- %18 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
- ins(%17, %10 : tensor<1x28x28x128xf32>, tensor<128xf32>) outs(%15 : tensor<1x28x28x128xf32>) {
- ^bb0(%in: f32, %in_1: f32, %out: f32):
- %20 = arith.addf %in, %in_1 : f32
- linalg.yield %20 : f32
- } -> tensor<1x28x28x128xf32>
- %padded = tensor.pad %18 low[0, 1, 1, 0] high[0, 1, 1, 0] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst_0 : f32
- } : tensor<1x28x28x128xf32> to tensor<1x30x30x128xf32>
- flow.dispatch.tensor.store %padded, %7, offsets = [0, 0, 0, 0], sizes = [1, 30, 30, 128], strides = [1, 1, 1, 1]
- : tensor<1x30x30x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x30x30x128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d3)>
+module {
+ func.func @pad_with_producer_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c802816 = arith.constant 802816 : index
+ %c72545728 = arith.constant 72545728 : index
+ %c72676800 = arith.constant 72676800 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 1.001000e-05 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c802816) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x56x56x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72545728) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x256x128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72676800) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x30x30x128xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 56, 56, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x56x56x256xf32>> -> tensor<1x56x56x256xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 256, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x256x128xf32>> -> tensor<1x1x256x128xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
+ %7 = tensor.empty() : tensor<1x28x28x128xf32>
+ %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
+ %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%4, %5 : tensor<1x56x56x256xf32>, tensor<1x1x256x128xf32>) outs(%8 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
+ %10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<1x28x28x128xf32>, tensor<128xf32>) outs(%7 : tensor<1x28x28x128xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %11 = arith.addf %in, %in_1 : f32
+ linalg.yield %11 : f32
+ } -> tensor<1x28x28x128xf32>
+ %padded = tensor.pad %10 low[0, 1, 1, 0] high[0, 1, 1, 0] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst_0 : f32
+ } : tensor<1x28x28x128xf32> to tensor<1x30x30x128xf32>
+ flow.dispatch.tensor.store %padded, %3, offsets = [0, 0, 0, 0], sizes = [1, 30, 30, 128], strides = [1, 1, 1, 1] : tensor<1x30x30x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x30x30x128xf32>>
+ return
}
}
+
// CHECK-LABEL: func @pad_with_producer_dispatch()
// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x56x56x256xf32
// CHECK: %[[FILTER:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x1x256x128xf32
@@ -155,48 +117,27 @@
// CHECK-SAME: outs(%[[INTERIOR_SLICE]] :
// -----
-
-hal.executable private @pad_consumer_fusion {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu = "generic", cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @pad_consumer_fusion_dispatch ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @pad_consumer_fusion_dispatch() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly)
- : !flow.dispatch.tensor<readonly:tensor<1x14x14x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly)
- : !flow.dispatch.tensor<readonly:tensor<3x3x256x256xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0)
- : !flow.dispatch.tensor<readwrite:tensor<1x14x14x256xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x14x14x256xf32>> -> tensor<1x14x14x256xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 256, 256], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x256x256xf32>> -> tensor<3x3x256x256xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readwrite:tensor<1x14x14x256xf32>> -> tensor<1x14x14x256xf32>
- %padded = tensor.pad %3 low[0, 1, 1, 0] high[0, 1, 1, 0] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst : f32
- } : tensor<1x14x14x256xf32> to tensor<1x16x16x256xf32>
- %6 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
- ins(%padded, %4 : tensor<1x16x16x256xf32>, tensor<3x3x256x256xf32>) outs(%5 : tensor<1x14x14x256xf32>) -> tensor<1x14x14x256xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1]
- : tensor<1x14x14x256xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x14x14x256xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @pad_consumer_fusion_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x14x14x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x256x256xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x14x14x256xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x14x14x256xf32>> -> tensor<1x14x14x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 256, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x256x256xf32>> -> tensor<3x3x256x256xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x14x14x256xf32>> -> tensor<1x14x14x256xf32>
+ %padded = tensor.pad %3 low[0, 1, 1, 0] high[0, 1, 1, 0] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst : f32
+ } : tensor<1x14x14x256xf32> to tensor<1x16x16x256xf32>
+ %6 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%padded, %4 : tensor<1x16x16x256xf32>, tensor<3x3x256x256xf32>) outs(%5 : tensor<1x14x14x256xf32>) -> tensor<1x14x14x256xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : tensor<1x14x14x256xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x14x14x256xf32>>
+ return
}
}
+
// CHECK-LABEL: func @pad_consumer_fusion_dispatch()
// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan {{.+}} : memref<1x14x14x256xf32, #hal.descriptor_type<storage_buffer>>
// CHECK: %[[FILTER:.+]] = hal.interface.binding.subspan {{.+}} : memref<3x3x256x256xf32, #hal.descriptor_type<storage_buffer>>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir
index 9c91402..ee22e3a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir
@@ -1,53 +1,24 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' -split-input-file %s | FileCheck %s
-
-// Test peeling + vectorization using CPUDoubleTilingPeelingExpert. The tests in
-// this file are expected to preset lowering_config and translation_info on
-// dispatches.
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmcpu-lower-executable-target))' -split-input-file %s | FileCheck %s
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_matmul {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64">) {
- hal.executable.export @no_peel_static_matmul layout(#pipeline_layout) attributes {translation_info = #translation} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @no_peel_static_matmul() {
- %cst = arith.constant 0.000000e+00 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<128x64xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<64x512xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [128, 64], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x64xf32>> -> tensor<128x64xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [64, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<64x512xf32>> -> tensor<64x512xf32>
- %init = tensor.empty() : tensor<128x512xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<128x512xf32>) -> tensor<128x512xf32>
- %gemm = linalg.matmul {lowering_config = #config}
- ins(%lhs, %rhs : tensor<128x64xf32>, tensor<64x512xf32>)
- outs(%fill : tensor<128x512xf32>) -> tensor<128x512xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [128, 512], strides = [1, 1]
- : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64">
+module {
+ func.func @no_peel_static_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x64xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x64xf32>> -> tensor<128x64xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x512xf32>> -> tensor<64x512xf32>
+ %5 = tensor.empty() : tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x64xf32>, tensor<64x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ return
}
}
-// No peel loop should be generated since dims are multiple of the vector dims.
-
// CHECK-LABEL: func @no_peel_static_matmul()
// Vectorization:
// CHECK: scf.for
@@ -57,51 +28,25 @@
// CHECK-NOT: scf.for
// -----
-
#config = #iree_codegen.lowering_config<tile_sizes = [[65, 65, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_matmul {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64">) {
- hal.executable.export @peel_static_matmul layout(#pipeline_layout) attributes {translation_info = #translation} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @peel_static_matmul() {
- %cst = arith.constant 0.000000e+00 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<128x49xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<49x512xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [128, 49], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x49xf32>> -> tensor<128x49xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [49, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<49x512xf32>> -> tensor<49x512xf32>
- %init = tensor.empty() : tensor<128x512xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<128x512xf32>) -> tensor<128x512xf32>
- %gemm = linalg.matmul {lowering_config = #config}
- ins(%lhs, %rhs : tensor<128x49xf32>, tensor<49x512xf32>)
- outs(%fill : tensor<128x512xf32>) -> tensor<128x512xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [128, 512], strides = [1, 1]
- : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64">
+module {
+ func.func @peel_static_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x49xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<49x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x49xf32>> -> tensor<128x49xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [49, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x512xf32>> -> tensor<49x512xf32>
+ %5 = tensor.empty() : tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x49xf32>, tensor<49x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ return
}
}
-// Peel loops should be generated for the 2nd and 3rd dims they are not multiple of the vector dims.
-
// CHECK-LABEL: func @peel_static_matmul()
// Vectorization:
// CHECK: scf.for
@@ -123,57 +68,31 @@
// CHECK-NOT: scf.for
// -----
-
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_matmul {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64">) {
- hal.executable.export @peel_dynamic_matmul layout(#pipeline_layout) attributes {translation_info = #translation} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @peel_dynamic_matmul() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %dim0 = arith.index_cast %0 : i32 to index
- %dim1 = arith.index_cast %1 : i32 to index
- %dim2 = arith.index_cast %2 : i32 to index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim1, %dim0}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim0, %dim2}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%dim1, %dim2}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%dim1, %dim0], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim1, %dim0} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%dim0, %dim2], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim0, %dim2} -> tensor<?x?xf32>
- %init = tensor.empty(%dim1, %dim2) : tensor<?x?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- %gemm = linalg.matmul {lowering_config = #config}
- ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%dim1, %dim2], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%dim1, %dim2}
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64">
+module {
+ func.func @peel_dynamic_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = arith.index_cast %0 : i32 to index
+ %4 = arith.index_cast %1 : i32 to index
+ %5 = arith.index_cast %2 : i32 to index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %3}
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %5}
+ %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%4, %5}
+ %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%4, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %3} -> tensor<?x?xf32>
+ %10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%3, %5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %5} -> tensor<?x?xf32>
+ %11 = tensor.empty(%4, %5) : tensor<?x?xf32>
+ %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %13 = linalg.matmul {lowering_config = #config} ins(%9, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%12 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %13, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%4, %5}
+ return
}
}
-// Peel loops should be generated for all the dims since they are dynamic.
-
// CHECK-LABEL: func @peel_dynamic_matmul()
// Distribution:
// CHECK: scf.for
@@ -203,57 +122,28 @@
// CHECK-NOT: scf.for
// -----
-
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0], [8, [32], 0], [0, 0, 1], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_matmul {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @peel_scalable_matmul layout(#pipeline_layout) attributes {translation_info = #translation} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @peel_scalable_matmul() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %dim0 = arith.index_cast %0 : i32 to index
- %dim1 = arith.index_cast %1 : i32 to index
- %dim2 = arith.index_cast %2 : i32 to index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim1, %dim0}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim0, %dim2}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%dim1, %dim2}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%dim1, %dim0], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim1, %dim0} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%dim0, %dim2], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim0, %dim2} -> tensor<?x?xf32>
- %init = tensor.empty(%dim1, %dim2) : tensor<?x?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- %gemm = linalg.matmul {lowering_config = #config}
- ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%dim1, %dim2], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%dim1, %dim2}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @peel_scalable_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = arith.index_cast %0 : i32 to index
+ %4 = arith.index_cast %1 : i32 to index
+ %5 = arith.index_cast %2 : i32 to index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %3}
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %5}
+ %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%4, %5}
+ %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%4, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %3} -> tensor<?x?xf32>
+ %10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%3, %5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %5} -> tensor<?x?xf32>
+ %11 = tensor.empty(%4, %5) : tensor<?x?xf32>
+ %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %13 = linalg.matmul {lowering_config = #config} ins(%9, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%12 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %13, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%4, %5}
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir
index a512d77..63ab24f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir
@@ -1,41 +1,31 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --iree-llvmcpu-reassociate-fp-reductions=false --split-input-file %s | FileCheck %s
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --iree-llvmcpu-reassociate-fp-reductions=true --split-input-file %s | FileCheck %s --check-prefix=REORDERCHECK
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --iree-llvmcpu-reassociate-fp-reductions=false --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --iree-llvmcpu-reassociate-fp-reductions=true --split-input-file %s | FileCheck %s --check-prefix=REORDERCHECK
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_pass1_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0> : tensor<1024x512xi32>
- %c1_i32 = arith.constant 1 : i32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>> -> tensor<1024x512x256xi32>
- %3 = tensor.empty() : tensor<1024x512xi32>
- %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024x512xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %6 = arith.addi %arg0, %arg1 : i32
- linalg.yield %6 : i32
- } -> tensor<1024x512xi32>
- %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<1024x512xi32>) outs(%3 : tensor<1024x512xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %6 = arith.addi %arg0, %c1_i32 : i32
- linalg.yield %6 : i32
- } -> tensor<1024x512xi32>
- flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
- return
- }
- }
+module {
+ func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0> : tensor<1024x512xi32>
+ %c1_i32 = arith.constant 1 : i32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>> -> tensor<1024x512x256xi32>
+ %3 = tensor.empty() : tensor<1024x512xi32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024x512xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %6 = arith.addi %in, %out : i32
+ linalg.yield %6 : i32
+ } -> tensor<1024x512xi32>
+ %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<1024x512xi32>) outs(%3 : tensor<1024x512xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %6 = arith.addi %in, %c1_i32 : i32
+ linalg.yield %6 : i32
+ } -> tensor<1024x512xi32>
+ flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
+ return
}
}
@@ -53,42 +43,31 @@
// CHECK: arith.addi %{{.+}}, %{{.+}} : vector<4xi32>
// -----
-
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_pass1_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0.0> : tensor<1024x512xf32>
- %c1_f32 = arith.constant 1.0 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xf32>> -> tensor<1024x512x256xf32>
- %3 = tensor.empty() : tensor<1024x512xf32>
- %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xf32>) outs(%cst : tensor<1024x512xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %6 = arith.addf %arg0, %arg1 : f32
- linalg.yield %6 : f32
- } -> tensor<1024x512xf32>
- %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<1024x512xf32>) outs(%3 : tensor<1024x512xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %6 = arith.addf %arg0, %c1_f32 : f32
- linalg.yield %6 : f32
- } -> tensor<1024x512xf32>
- flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- return
- }
- }
+module {
+ func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024x512xf32>
+ %cst_0 = arith.constant 1.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xf32>> -> tensor<1024x512x256xf32>
+ %3 = tensor.empty() : tensor<1024x512xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xf32>) outs(%cst : tensor<1024x512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %out : f32
+ linalg.yield %6 : f32
+ } -> tensor<1024x512xf32>
+ %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<1024x512xf32>) outs(%3 : tensor<1024x512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %cst_0 : f32
+ linalg.yield %6 : f32
+ } -> tensor<1024x512xf32>
+ flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ return
}
}
@@ -109,38 +88,27 @@
// REORDERCHECK: arith.addf %{{.+}}, %{{.+}} : vector<4xf32>
// -----
-
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_pass2_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_innermost_reduction_next_dynamic_supported ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_innermost_reduction_next_dynamic_supported() {
- %c0_i32 = arith.constant 0 : i32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = arith.index_castui %0 : i32 to index
- %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x?x256xi32>>{%1}
- %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x?xi32>>{%1}
- %4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [1024, %1, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x?x256xi32>>{%1} -> tensor<1024x?x256xi32>
- %5 = tensor.empty(%1) : tensor<1024x?xi32>
- %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x?xi32>) -> tensor<1024x?xi32>
- %7 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1024x?x256xi32>) outs(%6 : tensor<1024x?xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %8 = arith.addi %arg0, %arg1 : i32
- linalg.yield %8 : i32
- } -> tensor<1024x?xi32>
- flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1024, %1], strides = [1, 1] : tensor<1024x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x?xi32>>{%1}
- return
- }
- }
+module {
+ func.func @split_reduction_innermost_reduction_next_dynamic_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0_i32 = arith.constant 0 : i32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = arith.index_castui %0 : i32 to index
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x?x256xi32>>{%1}
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x?xi32>>{%1}
+ %4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [1024, %1, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x?x256xi32>>{%1} -> tensor<1024x?x256xi32>
+ %5 = tensor.empty(%1) : tensor<1024x?xi32>
+ %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x?xi32>) -> tensor<1024x?xi32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1024x?x256xi32>) outs(%6 : tensor<1024x?xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %8 = arith.addi %in, %out : i32
+ linalg.yield %8 : i32
+ } -> tensor<1024x?xi32>
+ flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1024, %1], strides = [1, 1] : tensor<1024x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x?xi32>>{%1}
+ return
}
}
@@ -157,34 +125,23 @@
// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
// -----
-
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_pass3_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_innermost_reduction_next_imperfect_tiling_supported ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0> : tensor<1024x513xi32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x513x256xi32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x513xi32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 513, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x513x256xi32>> -> tensor<1024x513x256xi32>
- %3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x513x256xi32>) outs(%cst : tensor<1024x513xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %4 = arith.addi %arg0, %arg1 : i32
- linalg.yield %4 : i32
- } -> tensor<1024x513xi32>
- flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 513], strides = [1, 1] : tensor<1024x513xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x513xi32>>
- return
- }
- }
+module {
+ func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0> : tensor<1024x513xi32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x513x256xi32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x513xi32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 513, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x513x256xi32>> -> tensor<1024x513x256xi32>
+ %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x513x256xi32>) outs(%cst : tensor<1024x513xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %4 = arith.addi %in, %out : i32
+ linalg.yield %4 : i32
+ } -> tensor<1024x513xi32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 513], strides = [1, 1] : tensor<1024x513xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x513xi32>>
+ return
}
}
@@ -201,36 +158,25 @@
// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
// -----
-
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_fail1_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_innermost_dynamic_reduction_unsupported ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_innermost_dynamic_reduction_unsupported() {
- %cst = arith.constant dense<0> : tensor<1024x512xi32>
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = arith.index_castui %0 : i32 to index
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x?xi32>>{%1}
- %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [1024, 512, %1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x?xi32>>{%1} -> tensor<1024x512x?xi32>
- %5 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1024x512x?xi32>) outs(%cst : tensor<1024x512xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %6 = arith.addi %arg0, %arg1 : i32
- linalg.yield %6 : i32
- } -> tensor<1024x512xi32>
- flow.dispatch.tensor.store %5, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
- return
- }
- }
+module {
+ func.func @split_reduction_innermost_dynamic_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant dense<0> : tensor<1024x512xi32>
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = arith.index_castui %0 : i32 to index
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x?xi32>>{%1}
+ %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [1024, 512, %1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x?xi32>>{%1} -> tensor<1024x512x?xi32>
+ %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1024x512x?xi32>) outs(%cst : tensor<1024x512xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %6 = arith.addi %in, %out : i32
+ linalg.yield %6 : i32
+ } -> tensor<1024x512xi32>
+ flow.dispatch.tensor.store %5, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
+ return
}
}
@@ -238,34 +184,23 @@
// CHECK-4: vector.mask %{{.*}} { vector.reduction <add>
// -----
-
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_fail2_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_innermost_imperfect_reduction_unsupported ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_innermost_imperfect_reduction_unsupported() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0> : tensor<1024x512xi32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x257xi32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 257], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x257xi32>> -> tensor<1024x512x257xi32>
- %3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x257xi32>) outs(%cst : tensor<1024x512xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %4 = arith.addi %arg0, %arg1 : i32
- linalg.yield %4 : i32
- } -> tensor<1024x512xi32>
- flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
- return
- }
- }
+module {
+ func.func @split_reduction_innermost_imperfect_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0> : tensor<1024x512xi32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x257xi32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 257], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x257xi32>> -> tensor<1024x512x257xi32>
+ %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x257xi32>) outs(%cst : tensor<1024x512xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %4 = arith.addi %in, %out : i32
+ linalg.yield %4 : i32
+ } -> tensor<1024x512xi32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xi32>>
+ return
}
}
@@ -273,34 +208,23 @@
// CHECK-4: vector.mask %{{.*}} { vector.reduction <add>
// -----
-
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d2, d1)>
+#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_fail3_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_not_innermost_reduction_unsupported ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_not_innermost_reduction_unsupported() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0> : tensor<1024x256xi32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x256xi32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>> -> tensor<1024x512x256xi32>
- %3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024x256xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %4 = arith.addi %arg0, %arg1 : i32
- linalg.yield %4 : i32
- } -> tensor<1024x256xi32>
- flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x256xi32>>
- return
- }
- }
+module {
+ func.func @split_reduction_not_innermost_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0> : tensor<1024x256xi32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x256xi32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>> -> tensor<1024x512x256xi32>
+ %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024x256xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %4 = arith.addi %in, %out : i32
+ linalg.yield %4 : i32
+ } -> tensor<1024x256xi32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x256xi32>>
+ return
}
}
@@ -309,34 +233,23 @@
// CHECK-NOT: vector.reduction
// -----
-
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-hal.executable private @split_reduction_fail4_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @split_reduction_double_reduction_unsupported ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @split_reduction_double_reduction_unsupported() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant dense<0> : tensor<1024xi32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024xi32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>> -> tensor<1024x512x256xi32>
- %3 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024xi32>) {
- ^bb0(%arg0: i32, %arg1: i32):
- %4 = arith.addi %arg0, %arg1 : i32
- linalg.yield %4 : i32
- } -> tensor<1024xi32>
- flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024xi32>>
- return
- }
- }
+module {
+ func.func @split_reduction_double_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0> : tensor<1024xi32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024xi32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512x256xi32>> -> tensor<1024x512x256xi32>
+ %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ %4 = arith.addi %in, %out : i32
+ linalg.yield %4 : i32
+ } -> tensor<1024xi32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024xi32>>
+ return
}
}
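
For reference when reading the remaining hunks, every rewritten case in these test files reduces to the same shape: a bare `module` whose `func.func` carries the executable target (and, when the test pins a strategy, the `translation_info`) as function attributes, exercised by the RUN lines shown at the top of each file. Below is a minimal sketch of that shape; the function name, the chosen pipeline, and the tiny body are placeholders for illustration, not copied from any particular test:

```mlir
// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
#target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64",
    {native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert>
module {
  // The two attributes below carry the information the old tests spelled out
  // on the hal.executable.variant / hal.executable.export wrappers.
  func.func @example() attributes {hal.executable.target = #target,
                                   translation_info = #translation} {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %out = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1xf32>>
    %empty = tensor.empty() : tensor<1xf32>
    %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1xf32>) -> tensor<1xf32>
    flow.dispatch.tensor.store %fill, %out, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1xf32>>
    return
  }
}
// CHECK-LABEL: func.func @example()
```

Cases that leave `translation_info` unset (like the split-reduction tests above) rely on the module-scoped strategy-selection pass to attach one before the function-scoped lowering runs.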
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_ssve_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_ssve_tests.mlir
index 7bfe56c..ffa4861 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_ssve_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_ssve_tests.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
// Check Armv9 Streaming SVE mode is enabled for the following pipelines:
//
@@ -7,228 +7,106 @@
// * CPUConvTileAndDecomposeExpert
// * CPUDoubleTilingExpert
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
- cpu_features = "+sve,+sme",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
-}>
-
-hal.executable private @aarch64_ssve__cpu_buffer_ops_tile_and_vectorize {
- hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
- lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>,
- translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
- } {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- hal.return %arg1, %arg2, %arg2 : index, index, index
- }
- builtin.module {
- func.func @dispatch() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- %7 = tensor.empty() : tensor<1xf32>
- %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
- flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_,
+ translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ %2 = tensor.empty() : tensor<1xf32>
+ %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>}
+ ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ return
}
}
-// CHECK-LABEL: @aarch64_ssve__cpu_buffer_ops_tile_and_vectorize
-// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+// CHECK: func.func @dispatch()
+// CHECK-SAME: arm_locally_streaming
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
- cpu_features = "+sve,+sme",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
-}>
-
-hal.executable private @aarch64_ssve__cpu_double_tiling_peeling_expert {
- hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
- lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>,
- translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
- } {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- hal.return %arg1, %arg2, %arg2 : index, index, index
- }
- builtin.module {
- func.func @dispatch() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- %7 = tensor.empty() : tensor<1xf32>
- %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
- flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_,
+ translation_info = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ %2 = tensor.empty() : tensor<1xf32>
+ %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>}
+ ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ return
}
}
-// CHECK-LABEL: @aarch64_ssve__cpu_double_tiling_peeling_expert
-// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+// CHECK: func.func @dispatch()
+// CHECK-SAME: arm_locally_streaming
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
- cpu_features = "+sve,+sme",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
-}>
-
-hal.executable private @aarch64_ssve__cpu_double_tiling_expert {
- hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
- lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>,
- translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>
- } {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- hal.return %arg1, %arg2, %arg2 : index, index, index
- }
- builtin.module {
- func.func @dispatch() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- %7 = tensor.empty() : tensor<1xf32>
- %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
- flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_,
+ translation_info = #iree_codegen.translation_info<CPUDoubleTilingExpert>} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ %2 = tensor.empty() : tensor<1xf32>
+ %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>}
+ ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ return
}
}
-// CHECK-LABEL: @aarch64_ssve__cpu_double_tiling_expert
-// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+// CHECK: func.func @dispatch()
+// CHECK-SAME: arm_locally_streaming
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
- cpu_features = "+sve,+sme",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
-}>
-
-hal.executable private @aarch64_ssve__cpu_conv_tile_and_decompose_expert {
- hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
- lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>,
- translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
- } {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- hal.return %arg1, %arg2, %arg2 : index, index, index
- }
- builtin.module {
- func.func @dispatch() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- %7 = tensor.empty() : tensor<1xf32>
- %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
- flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_,
+ translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ %2 = tensor.empty() : tensor<1xf32>
+ %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>}
+ ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ return
}
}
-// CHECK-LABEL: @aarch64_ssve__cpu_conv_tile_and_decompose_expert
-// CHECK: func.func @dispatch() attributes {arm_locally_streaming}
+// CHECK: func.func @dispatch()
+// CHECK-SAME: arm_locally_streaming
// -----
-
-// Check Armv9 Streaming SVE mode is not enabled if +sve is not
-// specified.
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-#executable_target_embedded_elf_arm_64_no_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {
- cpu_features = "+sme",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
-}>
-
-hal.executable private @aarch64_ssve_sve_disabled {
- hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64_no_sve) {
- hal.executable.export public @dispatch ordinal(0) layout(#pipeline_layout) attributes {
- lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>,
- translation_info = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
- } {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- hal.return %arg1, %arg2, %arg2 : index, index, index
- }
- builtin.module {
- func.func @dispatch() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- %7 = tensor.empty() : tensor<1xf32>
- %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
- flow.dispatch.tensor.store %8, %6, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_,
+ translation_info = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ %2 = tensor.empty() : tensor<1xf32>
+ %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>}
+ ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
+ flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1], strides = [1] : tensor<1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1xf32>>
+ return
}
}
-// CHECK-LABEL: @aarch64_ssve_sve_disabled
-// CHECK-NOT: func.func @dispatch() attributes {arm_locally_streaming}
+// CHECK: func.func @dispatch()
+// CHECK-NOT: arm_locally_streaming
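
One further detail visible in the SSVE hunks above: the `lowering_config` that the old tests attached to the `hal.executable.export` op now rides on the root linalg op itself. A reduced illustration of that placement, reusing the tile sizes from the tests above; the function signature here is a placeholder, not part of any test:

```mlir
#config = #iree_codegen.lowering_config<tile_sizes = [[0], [1], [0], [0]]>
func.func @fill_with_config(%init: tensor<1xf32>) -> tensor<1xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  // The per-op attribute is what the tile-size-driven lowering consults once
  // the strategy named in translation_info runs on the surrounding function.
  %0 = linalg.fill {lowering_config = #config}
         ins(%cst : f32) outs(%init : tensor<1xf32>) -> tensor<1xf32>
  return %0 : tensor<1xf32>
}
```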
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 6cbedcf..1bbb0d7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
// Check that this dispatch compiles to vectors and that there are no allocas.
// By proxy checks that destination passing style kicked in correctly
@@ -6,53 +6,35 @@
// and the conversion to destination passing style. Running CSE
// before hoists the fill and the empty out of the loop causing
// issues with the conversion.
-#map3 = affine_map<(d0) -> (d0)>
-#map4 = affine_map<(d0, d1) -> (d0)>
-#map5 = affine_map<(d0, d1) -> (d0, d1)>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-none-elf"}>
-#pipeline_layout5 = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>]
- >]>
-hal.executable private @check_no_cse {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @check_no_cse ordinal(0) layout(#pipeline_layout5) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @check_no_cse() {
- %cst = arith.constant 3.840000e+02 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 10752 : index]} : i32 to index
- %3 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [10752 : index, 21504 : index]} : i32 to index
- %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) : !flow.dispatch.tensor<readonly:tensor<7x384xf32>>
- %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<7xf32>>
- %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [7, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<7x384xf32>> -> tensor<7x384xf32>
- %7 = tensor.empty() : tensor<7xf32>
- %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<7xf32>) -> tensor<7xf32>
- %9 = linalg.generic {indexing_maps = [#map5, #map4], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor<7x384xf32>) outs(%8 : tensor<7xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %11 = arith.addf %arg1, %arg0 : f32
- linalg.yield %11 : f32
- } -> tensor<7xf32>
- %10 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%9 : tensor<7xf32>) outs(%7 : tensor<7xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %11 = arith.divf %arg0, %cst : f32
- linalg.yield %11 : f32
- } -> tensor<7xf32>
- flow.dispatch.tensor.store %10, %5, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor<writeonly:tensor<7xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+#map2 = affine_map<(d0) -> (d0)>
+module {
+ func.func @check_no_cse() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 3.840000e+02 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 10752 : index]} : i32 to index
+ %3 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [10752 : index, 21504 : index]} : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) : !flow.dispatch.tensor<readonly:tensor<7x384xf32>>
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<7xf32>>
+ %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [7, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<7x384xf32>> -> tensor<7x384xf32>
+ %7 = tensor.empty() : tensor<7xf32>
+ %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<7xf32>) -> tensor<7xf32>
+ %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor<7x384xf32>) outs(%8 : tensor<7xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %11 = arith.addf %out, %in : f32
+ linalg.yield %11 : f32
+ } -> tensor<7xf32>
+ %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%9 : tensor<7xf32>) outs(%7 : tensor<7xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %11 = arith.divf %in, %cst : f32
+ linalg.yield %11 : f32
+ } -> tensor<7xf32>
+ flow.dispatch.tensor.store %10, %5, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor<writeonly:tensor<7xf32>>
+ return
}
}
// CHECK-LABEL: func.func @check_no_cse()
@@ -64,56 +46,37 @@
// CHECK: memref.store
// -----
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index,
- target_triple = "x86_64-none-elf"}>
-#pipeline_layout5 = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>]
- >]>
-hal.executable private @peel_partially_unaligned_matmul {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @peel_partially_unaligned_matmul ordinal(0) layout(#pipeline_layout5) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @peel_partially_unaligned_matmul() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [0 : index, 131712 : index]} : i32 to index
- %5 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [576704 : index, 1763072 : index]} : i32 to index
- %6 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [908480 : index, 2094848 : index]} : i32 to index
- %7 = arith.index_castui %3 {stream.alignment = 128 : index, stream.values = [2304 : index, 134016 : index]} : i32 to index
- %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x576xf32>>
- %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<576x144xf32>>
- %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x144xf32>>
- %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<1x144xf32>>
- %12 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x576xf32>> -> tensor<1x576xf32>
- %13 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [576, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x144xf32>> -> tensor<576x144xf32>
- %14 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x144xf32>> -> tensor<1x144xf32>
- %15 = tensor.empty() : tensor<1x144xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1x144xf32>) -> tensor<1x144xf32>
- %17 = linalg.matmul ins(%12, %13 : tensor<1x576xf32>, tensor<576x144xf32>) outs(%16 : tensor<1x144xf32>) -> tensor<1x144xf32>
- %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<1x144xf32>, tensor<1x144xf32>) outs(%15 : tensor<1x144xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %19 = arith.addf %in, %in_0 : f32
- %20 = arith.maximumf %19, %cst : f32
- linalg.yield %20 : f32
- } -> tensor<1x144xf32>
- flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : tensor<1x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x144xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @peel_partially_unaligned_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [0 : index, 131712 : index]} : i32 to index
+ %5 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [576704 : index, 1763072 : index]} : i32 to index
+ %6 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [908480 : index, 2094848 : index]} : i32 to index
+ %7 = arith.index_castui %3 {stream.alignment = 128 : index, stream.values = [2304 : index, 134016 : index]} : i32 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x576xf32>>
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<576x144xf32>>
+ %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x144xf32>>
+ %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<1x144xf32>>
+ %12 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x576xf32>> -> tensor<1x576xf32>
+ %13 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [576, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x144xf32>> -> tensor<576x144xf32>
+ %14 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x144xf32>> -> tensor<1x144xf32>
+ %15 = tensor.empty() : tensor<1x144xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1x144xf32>) -> tensor<1x144xf32>
+ %17 = linalg.matmul ins(%12, %13 : tensor<1x576xf32>, tensor<576x144xf32>) outs(%16 : tensor<1x144xf32>) -> tensor<1x144xf32>
+ %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<1x144xf32>, tensor<1x144xf32>) outs(%15 : tensor<1x144xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %19 = arith.addf %in, %in_0 : f32
+ %20 = arith.maximumf %19, %cst : f32
+ linalg.yield %20 : f32
+ } -> tensor<1x144xf32>
+ flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : tensor<1x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x144xf32>>
+ return
}
}
// Checks that the bounded stack allocation are created.
@@ -129,94 +92,55 @@
// CHECK: arith.maximumf {{.*}} : vector<
// -----
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 6, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>]
- >]>
-hal.executable private @batch_matmul_dynamic {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @batch_matmul_dynamic ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @batch_matmul_dynamic() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = hal.interface.constant.load[5] : i32
- %6 = arith.index_cast %0 : i32 to index
- %7 = arith.index_cast %1 : i32 to index
- %8 = arith.index_cast %2 : i32 to index
- %9 = arith.index_cast %3 : i32 to index
- %10 = arith.index_cast %4 : i32 to index
- %11 = arith.index_cast %5 : i32 to index
- %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%6, %7, %9}
- %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%10, %11, %8}
- %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%6, %7, %8}
- %15 = flow.dispatch.tensor.load %12, offsets = [0, 0, 0], sizes = [%6, %7, %9], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%6, %7, %9} -> tensor<?x?x?xf32>
- %16 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [%10, %11, %8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%10, %11, %8} -> tensor<?x?x?xf32>
- %17 = tensor.empty(%6, %7, %8) : tensor<?x?x?xf32>
- %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
- %19 = linalg.batch_matmul ins(%15, %16 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%18 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
- flow.dispatch.tensor.store %19, %14, offsets = [0, 0, 0], sizes = [%6, %7, %8], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%6, %7, %8}
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @batch_matmul_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = hal.interface.constant.load[5] : i32
+ %6 = arith.index_cast %0 : i32 to index
+ %7 = arith.index_cast %1 : i32 to index
+ %8 = arith.index_cast %2 : i32 to index
+ %9 = arith.index_cast %3 : i32 to index
+ %10 = arith.index_cast %4 : i32 to index
+ %11 = arith.index_cast %5 : i32 to index
+ %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%6, %7, %9}
+ %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%10, %11, %8}
+ %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%6, %7, %8}
+ %15 = flow.dispatch.tensor.load %12, offsets = [0, 0, 0], sizes = [%6, %7, %9], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%6, %7, %9} -> tensor<?x?x?xf32>
+ %16 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [%10, %11, %8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%10, %11, %8} -> tensor<?x?x?xf32>
+ %17 = tensor.empty(%6, %7, %8) : tensor<?x?x?xf32>
+ %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ %19 = linalg.batch_matmul ins(%15, %16 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%18 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ flow.dispatch.tensor.store %19, %14, offsets = [0, 0, 0], sizes = [%6, %7, %8], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%6, %7, %8}
+ return
}
}
// CHECK-LABEL: func.func @batch_matmul_dynamic
// CHECK: vector.fma
// -----
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>]
- >]>
-hal.executable private @check_buffer_ops_vectorization {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @check_buffer_ops_vectorization ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0 * 1536 + d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @check_buffer_ops_vectorization() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1024xi32>
+ memref.assume_alignment %0, 64 : memref<128x1024xi32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1536xi32>
+ memref.assume_alignment %1, 64 : memref<128x1536xi32>
+ %subview = memref.subview %1[0, 0] [128, 1024] [1, 1] : memref<128x1536xi32> to memref<128x1024xi32, #map>
+ linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : memref<128x1024xi32>) outs(%subview : memref<128x1024xi32, #map>) {
+ ^bb0(%in: i32, %out: i32):
+ linalg.yield %in : i32
}
- builtin.module {
- func.func @check_buffer_ops_vectorization() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1024xi32>
- memref.assume_alignment %0, 64 : memref<128x1024xi32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1536xi32>
- memref.assume_alignment %1, 64 : memref<128x1536xi32>
- %2 = memref.subview %1[0, 0] [128, 1024] [1, 1] : memref<128x1536xi32> to memref<128x1024xi32, affine_map<(d0, d1) -> (d0 * 1536 + d1)>>
- linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]}
- ins(%0 : memref<128x1024xi32>)
- outs(%2 : memref<128x1024xi32, affine_map<(d0, d1) -> (d0 * 1536 + d1)>>) {
- ^bb0(%arg0: i32, %arg1: i32):
- linalg.yield %arg0 : i32
- }
- return
- }
- }
+ return
}
}
// CHECK-LABEL: #{{.+}} = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize
@@ -225,65 +149,39 @@
// CHECK: vector.store
// -----
-
-hal.executable private @vectorize_fill_conv2d_generic {
- hal.executable.variant public @embedded_elf_x86_64 target(#hal.executable.target<
- "llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-none-elf"
- }>) {
- hal.executable.export public @vectorize_fill_conv2d_generic ordinal(0) layout(
- #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>]
- >]>
- ) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index, %arg4 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @vectorize_fill_conv2d_generic() {
- %cst = arith.constant 0.000000e+00 : f32
- %cst_0 = arith.constant 3.000000e+00 : f32
- %cst_1 = arith.constant 6.000000e+00 : f32
- %cst_2 = arith.constant 0.166666672 : f32
- %cst_3 = arith.constant dense<0.0> : tensor<16xf32>
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
- %5 = tensor.empty() : tensor<1x112x112x16xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
- %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
- %8 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3) -> (d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"
- ]} ins(%cst_3, %7 : tensor<16xf32>, tensor<1x112x112x16xf32>) outs(%5 : tensor<1x112x112x16xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %9 = arith.addf %arg0, %arg1 : f32
- %10 = arith.addf %9, %cst_0 : f32
- %11 = arith.cmpf olt, %10, %cst : f32
- %12 = arith.select %11, %cst, %10 : f32
- %13 = arith.cmpf olt, %cst_1, %10 : f32
- %14 = arith.select %13, %cst_1, %12 : f32
- %15 = arith.mulf %9, %14 : f32
- %16 = arith.mulf %15, %cst_2 : f32
- linalg.yield %16 : f32
- } -> tensor<1x112x112x16xf32>
- flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2, d3) -> (d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @vectorize_fill_conv2d_generic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 3.000000e+00 : f32
+ %cst_1 = arith.constant 6.000000e+00 : f32
+ %cst_2 = arith.constant 0.166666672 : f32
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<16xf32>
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
+ %5 = tensor.empty() : tensor<1x112x112x16xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+ %8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_3, %7 : tensor<16xf32>, tensor<1x112x112x16xf32>) outs(%5 : tensor<1x112x112x16xf32>) {
+ ^bb0(%in: f32, %in_4: f32, %out: f32):
+ %9 = arith.addf %in, %in_4 : f32
+ %10 = arith.addf %9, %cst_0 : f32
+ %11 = arith.cmpf olt, %10, %cst : f32
+ %12 = arith.select %11, %cst, %10 : f32
+ %13 = arith.cmpf olt, %cst_1, %10 : f32
+ %14 = arith.select %13, %cst_1, %12 : f32
+ %15 = arith.mulf %9, %14 : f32
+ %16 = arith.mulf %15, %cst_2 : f32
+ linalg.yield %16 : f32
+ } -> tensor<1x112x112x16xf32>
+ flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+ return
}
}
@@ -295,44 +193,34 @@
// CHECK: arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x4xf32>
// -----
-
-hal.executable private @multi_result {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @multi_result ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @multi_result() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %cst_0 = arith.constant 1.000000e-03 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<writeonly:tensor<128x256xf32>>
- %5 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
- %6 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
- %7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
- %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %9 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
- %13 = tensor.empty() : tensor<64x256xf32>
- %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<64x256xf32>) -> tensor<64x256xf32>
- %15 = linalg.matmul ins(%7, %8 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%14 : tensor<64x256xf32>) -> tensor<64x256xf32>
- %16 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%15, %9 : tensor<64x256xf32>, tensor<256xf32>) outs(%13 : tensor<64x256xf32>) {
- ^bb0(%in: f32, %in_1: f32, %out: f32):
- %17 = arith.addf %in, %in_1 : f32
- linalg.yield %17 : f32
- } -> tensor<64x256xf32>
- flow.dispatch.tensor.store %15, %5, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
- flow.dispatch.tensor.store %16, %6, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d1)>
+module {
+ func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 1.000000e-03 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<writeonly:tensor<128x256xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
+ %5 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
+ %6 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x128xf32>> -> tensor<64x128xf32>
+ %7 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %8 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
+ %9 = tensor.empty() : tensor<64x256xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32>
+ %11 = linalg.matmul ins(%6, %7 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%10 : tensor<64x256xf32>) -> tensor<64x256xf32>
+ %12 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%11, %8 : tensor<64x256xf32>, tensor<256xf32>) outs(%9 : tensor<64x256xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %13 = arith.addf %in, %in_1 : f32
+ linalg.yield %13 : f32
+ } -> tensor<64x256xf32>
+ flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
+ flow.dispatch.tensor.store %12, %5, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x256xf32>>
+ return
}
}
// CHECK-LABEL: func @multi_result
@@ -344,31 +232,19 @@
// -----
-hal.executable private @mmt4d_ukernel {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64",
- {cpu = "generic", cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-none-elf",
- ukernels = "mmt4d"}>) {
- hal.executable.export public @ukernel_dispatch ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @ukernel_dispatch() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4x8x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x4x16x32xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x16x8x16xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x8x32xf32>> -> tensor<2x4x8x32xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [16, 4, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4x16x32xf32>> -> tensor<16x4x16x32xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x16x8x16xf32>> -> tensor<2x16x8x16xf32>
- %6 = linalg.mmt4d ins(%3, %4 : tensor<2x4x8x32xf32>, tensor<16x4x16x32xf32>) outs(%5 : tensor<2x16x8x16xf32>) -> tensor<2x16x8x16xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : tensor<2x16x8x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x16x8x16xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf", ukernels = "mmt4d"}>
+module {
+ func.func @ukernel_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4x8x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x4x16x32xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x16x8x16xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x8x32xf32>> -> tensor<2x4x8x32xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [16, 4, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4x16x32xf32>> -> tensor<16x4x16x32xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x16x8x16xf32>> -> tensor<2x16x8x16xf32>
+ %6 = linalg.mmt4d ins(%3, %4 : tensor<2x4x8x32xf32>, tensor<16x4x16x32xf32>) outs(%5 : tensor<2x16x8x16xf32>) -> tensor<2x16x8x16xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : tensor<2x16x8x16xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x16x8x16xf32>>
+ return
}
}
// CHECK-LABEL: func @ukernel_dispatch()
@@ -379,51 +255,31 @@
// CHECK: iree_codegen.ukernel.generic "iree_uk_mmt4d"
// -----
-
-hal.executable private @ukernel_pass_through {
- hal.executable.variant public @embedded_elf_x86_64 target(<
- "llvm-cpu", "embedded-elf-x86_64", {
- cpu = "generic", cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-none-elf",
- ukernels = "all"}>) {
- hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout<
- push_constants = 2, sets = [
- <0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>,
- <2, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<CPUDefault>} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- hal.return %arg1, %arg2, %arg2 : index, index, index
- }
- builtin.module {
- func.func @dispatch() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.index_castui %0 : i32 to index
- %3 = arith.index_castui %1 : i32 to index
- %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2}
- %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3}
- %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %7 = affine.min affine_map<()[s0, s1, s2] -> (s0 - s1 * (s0 ceildiv s2), s0 ceildiv s2)>()[%2, %workgroup_id_x, %workgroup_count_x]
- %8 = affine.apply affine_map<()[s0, s1, s2] -> (s0 * (s1 ceildiv s2))>()[%workgroup_id_x, %2, %workgroup_count_x]
- %9 = flow.dispatch.tensor.load %4, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2} -> tensor<?xf32>
- %10 = flow.dispatch.tensor.load %5, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3} -> tensor<?xf32>
- %11 = tensor.empty(%7) : tensor<?xf32>
- %12 = iree_codegen.ukernel.generic "simple_mul_workgroup" ins(%9, %10 : tensor<?xf32>, tensor<?xf32>) outs(%11 : tensor<?xf32>) (%7 : index) -> tensor<?xf32>
- flow.dispatch.tensor.store %12, %6, offsets = [%8], sizes = [%7], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf", ukernels = "all"}>
+#map = affine_map<()[s0, s1, s2] -> (s0 - s1 * (s0 ceildiv s2), s0 ceildiv s2)>
+#map1 = affine_map<()[s0, s1, s2] -> (s0 * (s1 ceildiv s2))>
+module {
+ func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_castui %0 : i32 to index
+ %3 = arith.index_castui %1 : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3}
+ %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %7 = affine.min #map()[%2, %workgroup_id_x, %workgroup_count_x]
+ %8 = affine.apply #map1()[%workgroup_id_x, %2, %workgroup_count_x]
+ %9 = flow.dispatch.tensor.load %4, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2} -> tensor<?xf32>
+ %10 = flow.dispatch.tensor.load %5, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3} -> tensor<?xf32>
+ %11 = tensor.empty(%7) : tensor<?xf32>
+ %12 = iree_codegen.ukernel.generic "simple_mul_workgroup" ins(%9, %10 : tensor<?xf32>, tensor<?xf32>) outs(%11 : tensor<?xf32>) (%7 : index) -> tensor<?xf32>
+ flow.dispatch.tensor.store %12, %6, offsets = [%8], sizes = [%7], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
+ return
}
}
-// CHECK-LABEL: hal.executable private @ukernel_pass_through
-// CHECK: hal.executable.export public @dispatch
-// CHECK-NEXT: %[[ARG1:[a-zA-Z0-9]+]]: index
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index
-// CHECK: hal.return %[[ARG1]], %[[ARG2]], %[[ARG2]]
// CHECK: func @dispatch
// CHECK: %[[INPUT0:.+]] = hal.interface.binding.subspan set(0) binding(0)
// CHECK-SAME: memref<?xf32, #hal.descriptor_type<storage_buffer>>
@@ -441,41 +297,25 @@
// CHECK-SAME: outs(%[[SUBVIEW_OUTPUT]]
// -----
-
-hal.executable private @unsupported_ukernel_fallback_to_vectorization {
- hal.executable.variant public @embedded_elf_x86_64 target(<
- "llvm-cpu", "embedded-elf-x86_64", {
- cpu = "generic",
- cpu_features = "+fma,+avx512f",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index,
- target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
- ) {
- hal.executable.export public @unsupported_ukernel_fallback_to_vectorization ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
- attributes {translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>} {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @unsupported_ukernel_fallback_to_vectorization() {
- %c0 = arith.constant 0 : index
- %c1024 = arith.constant 1024 : index
- %c132096 = arith.constant 132096 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x256x1x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1024) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4x256x128x1xi8>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c132096) : !flow.dispatch.tensor<writeonly:tensor<1x4x1x128xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 256, 1, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x256x1x1xf32>> -> tensor<1x256x1x1xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [4, 256, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256x128x1xi8>> -> tensor<4x256x128x1xi8>
- %5 = tensor.empty() : tensor<1x4x1x128xf32>
- %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 0, 0], [1, 1, 0, 1], [0, 0, 0, 0], [0, 0, 0, 0]]>} ins(%cst : f32) outs(%5 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32>
- %7 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 0, 1, 128, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]]>} ins(%3, %4 : tensor<1x256x1x1xf32>, tensor<4x256x128x1xi8>) outs(%6 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 1, 128], strides = [1, 1, 1, 1] : tensor<1x4x1x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4x1x128xf32>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 0, 0], [1, 1, 0, 1], [0, 0, 0, 0], [0, 0, 0, 0]]>
+#config1 = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 0, 0, 0, 0], [1, 1, 0, 1, 128, 0], [0, 0, 1, 0, 0, 1]]>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+fma,+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
+module {
+ func.func @unsupported_ukernel_fallback_to_vectorization() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #iree_codegen.translation_info<Mmt4dTilingExpert>} {
+ %c0 = arith.constant 0 : index
+ %c1024 = arith.constant 1024 : index
+ %c132096 = arith.constant 132096 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x256x1x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1024) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4x256x128x1xi8>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c132096) : !flow.dispatch.tensor<writeonly:tensor<1x4x1x128xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 256, 1, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x256x1x1xf32>> -> tensor<1x256x1x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [4, 256, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256x128x1xi8>> -> tensor<4x256x128x1xi8>
+ %5 = tensor.empty() : tensor<1x4x1x128xf32>
+ %6 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%5 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32>
+ %7 = linalg.mmt4d {lowering_config = #config1} ins(%3, %4 : tensor<1x256x1x1xf32>, tensor<4x256x128x1xi8>) outs(%6 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 1, 128], strides = [1, 1, 1, 1] : tensor<1x4x1x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4x1x128xf32>>
+ return
}
}
// CHECK-LABEL: func.func @unsupported_ukernel_fallback_to_vectorization
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir
index 5ac00ba..f9692d1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir
@@ -1,47 +1,22 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_10_8x8_pattern {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_10_8x8_pattern layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @transpose_10_8x8_pattern() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%3 : tensor<512x1024xf32>) outs(%5 : tensor<1024x512xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<1024x512xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @transpose_10_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1024x512xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ return
}
}
@@ -57,49 +32,23 @@
// CHECK-COUNT-8: vector.store
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_021_8x8_pattern {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_021_8x8_pattern layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @transpose_021_8x8_pattern() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x128x96xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x128x96xf32>> -> tensor<64x128x96xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%3 : tensor<64x96x128xf32>) outs(%5 : tensor<64x128x96xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<64x128x96xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : tensor<64x128x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x128x96xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d2, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @transpose_021_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x128x96xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<64x128x96xf32>> -> tensor<64x128x96xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<64x128x96xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<64x128x96xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : tensor<64x128x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x128x96xf32>>
+ return
}
}
@@ -115,49 +64,23 @@
// CHECK-COUNT-8: vector.store
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_201_8x8_pattern {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_201_8x8_pattern layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @transpose_201_8x8_pattern() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x64x96xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x64x96xf32>> -> tensor<128x64x96xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1, d2) -> (d1, d2, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%3 : tensor<64x96x128xf32>) outs(%5 : tensor<128x64x96xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<128x64x96xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : tensor<128x64x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x64x96xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d1, d2, d0)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @transpose_201_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x64x96xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x64x96xf32>> -> tensor<128x64x96xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<128x64x96xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<128x64x96xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : tensor<128x64x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x64x96xf32>>
+ return
}
}
@@ -173,49 +96,23 @@
// CHECK-COUNT-8: vector.store
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_210_8x8_pattern {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_210_8x8_pattern layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @transpose_210_8x8_pattern() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x96x64xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x96x64xf32>> -> tensor<128x96x64xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1, d2) -> (d2, d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%3 : tensor<64x96x128xf32>) outs(%5 : tensor<128x96x64xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<128x96x64xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : tensor<128x96x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x96x64xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d2, d1, d0)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @transpose_210_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x96x64xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<128x96x64xf32>> -> tensor<128x96x64xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<128x96x64xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<128x96x64xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : tensor<128x96x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x96x64xf32>>
+ return
}
}
@@ -231,49 +128,23 @@
// CHECK-COUNT-8: vector.store
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_120_8x8_pattern {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_120_8x8_pattern layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @transpose_120_8x8_pattern() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<96x128x64xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<96x128x64xf32>> -> tensor<96x128x64xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1, d2) -> (d2, d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%3 : tensor<64x96x128xf32>) outs(%5 : tensor<96x128x64xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<96x128x64xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : tensor<96x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<96x128x64xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @transpose_120_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<96x128x64xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<96x128x64xf32>> -> tensor<96x128x64xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<96x128x64xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<96x128x64xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : tensor<96x128x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<96x128x64xf32>>
+ return
}
}
@@ -289,49 +160,23 @@
// CHECK-COUNT-8: vector.store
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_102 {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_102 layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @transpose_102() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<96x64x128xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<96x64x128xf32>> -> tensor<96x64x128xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1, d2) -> (d1, d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%3 : tensor<64x96x128xf32>) outs(%5 : tensor<96x64x128xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<96x64x128xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : tensor<96x64x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<96x64x128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @transpose_102() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<96x64x128xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x96x128xf32>> -> tensor<64x96x128xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<96x64x128xf32>> -> tensor<96x64x128xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<96x64x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<96x64x128xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : tensor<96x64x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<96x64x128xf32>>
+ return
}
}
@@ -339,49 +184,23 @@
// CHECK-NOT: vector.shuffle %{{.*}}, %{{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32>
// CHECK-NOT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" %{{.*}}, %{{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32>
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- // No '+avx2' cpu feature.
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @test_no_avx2_feature {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @test_no_avx2_feature layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @test_no_avx2_feature() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%3 : tensor<512x1024xf32>) outs(%5 : tensor<1024x512xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<1024x512xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @test_no_avx2_feature() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1024x512xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir
index 9da0a97..9927fa7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir
@@ -1,57 +1,28 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' -split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' -split-input-file %s | FileCheck %s
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_generic_add {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mask_dynamic_generic_add() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %6 = arith.index_cast %0 : i32 to index
- %7 = arith.index_cast %1 : i32 to index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %init = tensor.empty(%6, %7) : tensor<?x?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) {
- ^bb0(%in0: f32, %in1: f32, %out: f32):
- %add = arith.addf %in0, %in1 : f32
- linalg.yield %add: f32
- } -> tensor<?x?xf32>
- flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 : i32 to index
+ %3 = arith.index_cast %1 : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %9 = tensor.empty(%2, %3) : tensor<?x?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %12 = arith.addf %in, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ return
}
}
@@ -66,52 +37,28 @@
// CHECK-NOT: scf.for
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_reduction {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 32 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @mask_dynamic_reduction layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mask_dynamic_reduction() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %6 = arith.index_cast %0 : i32 to index
- %7 = arith.index_cast %1 : i32 to index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %result_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %init = tensor.empty(%6) : tensor<?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?xf32>) -> tensor<?xf32>
- %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0)>],
- iterator_types = ["parallel", "reduction"]}
- ins(%lhs : tensor<?x?xf32>) outs(%fill : tensor<?xf32>) {
- ^bb0(%in0: f32, %out: f32):
- %add = arith.addf %out, %in0 : f32
- linalg.yield %add: f32
- } -> tensor<?xf32>
- flow.dispatch.tensor.store %generic, %result_binding, offsets = [0], sizes = [%6], strides = [1]
- : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @mask_dynamic_reduction() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 : i32 to index
+ %3 = arith.index_cast %1 : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%2}
+ %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %7 = tensor.empty(%2) : tensor<?xf32>
+ %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<?xf32>) -> tensor<?xf32>
+ %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor<?x?xf32>) outs(%8 : tensor<?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %10 = arith.addf %out, %in : f32
+ linalg.yield %10 : f32
+ } -> tensor<?xf32>
+ flow.dispatch.tensor.store %9, %5, offsets = [0], sizes = [%2], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%2}
+ return
}
}
@@ -120,58 +67,29 @@
// CHECK: vector.mask %{{.*}} { vector.reduction <add>
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_generic_add {
- hal.executable.variant @embedded_elf_rv32 target(<"llvm-cpu", "embedded-elf-riscv_32", {
- data_layout = "e-m:e-p:32:32-i64:64-n32-S128",
- native_vector_size = 32 : index,
- target_triple = "riscv32-none-elf"
- }>) {
- hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mask_dynamic_generic_add() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %6 = arith.index_cast %0 : i32 to index
- %7 = arith.index_cast %1 : i32 to index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %init = tensor.empty(%6, %7) : tensor<?x?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) {
- ^bb0(%in0: f32, %in1: f32, %out: f32):
- %add = arith.addf %in0, %in1 : f32
- linalg.yield %add: f32
- } -> tensor<?x?xf32>
- flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- return
- }
- }
+#executable_target_embedded_elf_riscv_32_ = #hal.executable.target<"llvm-cpu", "embedded-elf-riscv_32", {data_layout = "e-m:e-p:32:32-i64:64-n32-S128", native_vector_size = 32 : index, target_triple = "riscv32-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 : i32 to index
+ %3 = arith.index_cast %1 : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %9 = tensor.empty(%2, %3) : tensor<?x?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %12 = arith.addf %in, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ return
}
}
@@ -186,58 +104,29 @@
// CHECK-NOT: scf.for
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_generic_add {
- hal.executable.variant @embedded_elf_rv32 target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mask_dynamic_generic_add() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %6 = arith.index_cast %0 : i32 to index
- %7 = arith.index_cast %1 : i32 to index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %init = tensor.empty(%6, %7) : tensor<?x?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) {
- ^bb0(%in0: f32, %in1: f32, %out: f32):
- %add = arith.addf %in0, %in1 : f32
- linalg.yield %add: f32
- } -> tensor<?x?xf32>
- flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 : i32 to index
+ %3 = arith.index_cast %1 : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %9 = tensor.empty(%2, %3) : tensor<?x?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %12 = arith.addf %in, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ return
}
}
@@ -247,54 +136,24 @@
// CHECK-NOT: vector.maskedload
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_matmul {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @mask_matmul_sve layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mask_matmul_sve() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %M = hal.interface.constant.load[0] : index
- %N = hal.interface.constant.load[1] : index
- %K = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N}
- %init_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N} -> tensor<?x?xf32>
- %init = flow.dispatch.tensor.load %init_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N} -> tensor<?x?xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @mask_matmul_sve() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+ %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ return
}
}
@@ -304,59 +163,29 @@
// CHECK: vector.maskedload
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_generic_add {
- hal.executable.variant @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {
- cpu_features = "+sve",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mask_dynamic_generic_add() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %6 = arith.index_cast %0 : i32 to index
- %7 = arith.index_cast %1 : i32 to index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
- %init = tensor.empty(%6, %7) : tensor<?x?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) {
- ^bb0(%in0: f32, %in1: f32, %out: f32):
- %add = arith.addf %in0, %in1 : f32
- linalg.yield %add: f32
- } -> tensor<?x?xf32>
- flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 : i32 to index
+ %3 = arith.index_cast %1 : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+ %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+ %9 = tensor.empty(%2, %3) : tensor<?x?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%10 : tensor<?x?xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %12 = arith.addf %in, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%2, %3}
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir
index e2bf67e..e25e6c0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir
@@ -1,93 +1,87 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
-hal.executable private @main_dispatch_77 {
- hal.executable.variant public @system_elf_riscv_64 target(<"llvm-cpu", "system-elf-riscv_64", {cpu = "generic-rv64", cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128", native_vector_size = 64 : index, target_triple = "riscv64"}>) {
- hal.executable.export public @main_dispatch_77_generic_1x257x257x21 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @main_dispatch_77_generic_1x257x257x21() {
- %c1115136 = arith.constant 1115136 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 2.000000e+00 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %cst_1 = arith.constant 1.600000e+01 : f32
- %c1_i32 = arith.constant 1 : i32
- %c32_i32 = arith.constant 32 : i32
- %cst_2 = arith.constant 1.000000e+00 : f32
- %c0_i32 = arith.constant 0 : i32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1115136) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x33x33x21xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x257x257x21xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 21], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x21xf32>> -> tensor<1x33x33x21xf32>
- %3 = tensor.empty() : tensor<1x257x257x21xf32>
- %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3 : tensor<1x257x257x21xf32>) {
- ^bb0(%out: f32):
- %5 = linalg.index 1 : index
- %6 = linalg.index 0 : index
- %7 = affine.apply affine_map<(d0, d1) -> (d0 + d1 * 257)>(%5, %6)
- %8 = linalg.index 2 : index
- %9 = linalg.index 3 : index
- %10 = arith.index_cast %7 : index to i32
- %11 = arith.index_cast %8 : index to i32
- %12 = arith.uitofp %10 : i32 to f32
- %13 = arith.mulf %12, %cst : f32
- %14 = arith.addf %13, %cst_0 : f32
- %15 = arith.divf %14, %cst_1 : f32
- %16 = math.floor %15 : f32
- %17 = arith.subf %15, %16 : f32
- %18 = arith.fptosi %16 : f32 to i32
- %19 = arith.uitofp %11 : i32 to f32
- %20 = arith.mulf %19, %cst : f32
- %21 = arith.addf %20, %cst_0 : f32
- %22 = arith.divf %21, %cst_1 : f32
- %23 = math.floor %22 : f32
- %24 = arith.subf %22, %23 : f32
- %25 = arith.fptosi %23 : f32 to i32
- %26 = arith.addi %18, %c1_i32 : i32
- %27 = arith.cmpi slt, %18, %c0_i32 : i32
- %28 = arith.select %27, %c0_i32, %18 : i32
- %29 = arith.cmpi sgt, %18, %c32_i32 : i32
- %30 = arith.select %29, %c32_i32, %28 : i32
- %31 = arith.cmpi slt, %26, %c0_i32 : i32
- %32 = arith.select %31, %c0_i32, %26 : i32
- %33 = arith.cmpi sgt, %26, %c32_i32 : i32
- %34 = arith.select %33, %c32_i32, %32 : i32
- %35 = arith.index_cast %30 : i32 to index
- %36 = arith.index_cast %34 : i32 to index
- %37 = arith.addi %25, %c1_i32 : i32
- %38 = arith.cmpi slt, %25, %c0_i32 : i32
- %39 = arith.select %38, %c0_i32, %25 : i32
- %40 = arith.cmpi sgt, %25, %c32_i32 : i32
- %41 = arith.select %40, %c32_i32, %39 : i32
- %42 = arith.cmpi slt, %37, %c0_i32 : i32
- %43 = arith.select %42, %c0_i32, %37 : i32
- %44 = arith.cmpi sgt, %37, %c32_i32 : i32
- %45 = arith.select %44, %c32_i32, %43 : i32
- %46 = arith.index_cast %41 : i32 to index
- %47 = arith.index_cast %45 : i32 to index
- %extracted = tensor.extract %2[%c0, %35, %46, %9] : tensor<1x33x33x21xf32>
- %extracted_3 = tensor.extract %2[%c0, %35, %47, %9] : tensor<1x33x33x21xf32>
- %extracted_4 = tensor.extract %2[%c0, %36, %46, %9] : tensor<1x33x33x21xf32>
- %extracted_5 = tensor.extract %2[%c0, %36, %47, %9] : tensor<1x33x33x21xf32>
- %48 = arith.subf %cst_2, %24 : f32
- %49 = arith.mulf %extracted, %48 : f32
- %50 = arith.mulf %extracted_3, %24 : f32
- %51 = arith.addf %49, %50 : f32
- %52 = arith.mulf %extracted_4, %48 : f32
- %53 = arith.mulf %extracted_5, %24 : f32
- %54 = arith.addf %52, %53 : f32
- %55 = arith.subf %cst_2, %17 : f32
- %56 = arith.mulf %51, %55 : f32
- %57 = arith.mulf %54, %17 : f32
- %58 = arith.addf %56, %57 : f32
- linalg.yield %58 : f32
- } -> tensor<1x257x257x21xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [1, 257, 257, 21], strides = [1, 1, 1, 1] : tensor<1x257x257x21xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x257x257x21xf32>>
- return
- }
- }
+#executable_target_system_elf_riscv_64_ = #hal.executable.target<"llvm-cpu", "system-elf-riscv_64", {cpu = "generic-rv64", cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128", native_vector_size = 64 : index, target_triple = "riscv64"}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map1 = affine_map<(d0, d1) -> (d0 + d1 * 257)>
+module {
+ func.func @main_dispatch_77_generic_1x257x257x21() attributes {hal.executable.target = #executable_target_system_elf_riscv_64_} {
+ %c1115136 = arith.constant 1115136 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 2.000000e+00 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant 1.600000e+01 : f32
+ %c1_i32 = arith.constant 1 : i32
+ %c32_i32 = arith.constant 32 : i32
+ %cst_2 = arith.constant 1.000000e+00 : f32
+ %c0_i32 = arith.constant 0 : i32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1115136) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x33x33x21xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x257x257x21xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 21], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x21xf32>> -> tensor<1x33x33x21xf32>
+ %3 = tensor.empty() : tensor<1x257x257x21xf32>
+ %4 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3 : tensor<1x257x257x21xf32>) {
+ ^bb0(%out: f32):
+ %5 = linalg.index 1 : index
+ %6 = linalg.index 0 : index
+ %7 = affine.apply #map1(%5, %6)
+ %8 = linalg.index 2 : index
+ %9 = linalg.index 3 : index
+ %10 = arith.index_cast %7 : index to i32
+ %11 = arith.index_cast %8 : index to i32
+ %12 = arith.uitofp %10 : i32 to f32
+ %13 = arith.mulf %12, %cst : f32
+ %14 = arith.addf %13, %cst_0 : f32
+ %15 = arith.divf %14, %cst_1 : f32
+ %16 = math.floor %15 : f32
+ %17 = arith.subf %15, %16 : f32
+ %18 = arith.fptosi %16 : f32 to i32
+ %19 = arith.uitofp %11 : i32 to f32
+ %20 = arith.mulf %19, %cst : f32
+ %21 = arith.addf %20, %cst_0 : f32
+ %22 = arith.divf %21, %cst_1 : f32
+ %23 = math.floor %22 : f32
+ %24 = arith.subf %22, %23 : f32
+ %25 = arith.fptosi %23 : f32 to i32
+ %26 = arith.addi %18, %c1_i32 : i32
+ %27 = arith.cmpi slt, %18, %c0_i32 : i32
+ %28 = arith.select %27, %c0_i32, %18 : i32
+ %29 = arith.cmpi sgt, %18, %c32_i32 : i32
+ %30 = arith.select %29, %c32_i32, %28 : i32
+ %31 = arith.cmpi slt, %26, %c0_i32 : i32
+ %32 = arith.select %31, %c0_i32, %26 : i32
+ %33 = arith.cmpi sgt, %26, %c32_i32 : i32
+ %34 = arith.select %33, %c32_i32, %32 : i32
+ %35 = arith.index_cast %30 : i32 to index
+ %36 = arith.index_cast %34 : i32 to index
+ %37 = arith.addi %25, %c1_i32 : i32
+ %38 = arith.cmpi slt, %25, %c0_i32 : i32
+ %39 = arith.select %38, %c0_i32, %25 : i32
+ %40 = arith.cmpi sgt, %25, %c32_i32 : i32
+ %41 = arith.select %40, %c32_i32, %39 : i32
+ %42 = arith.cmpi slt, %37, %c0_i32 : i32
+ %43 = arith.select %42, %c0_i32, %37 : i32
+ %44 = arith.cmpi sgt, %37, %c32_i32 : i32
+ %45 = arith.select %44, %c32_i32, %43 : i32
+ %46 = arith.index_cast %41 : i32 to index
+ %47 = arith.index_cast %45 : i32 to index
+ %extracted = tensor.extract %2[%c0, %35, %46, %9] : tensor<1x33x33x21xf32>
+ %extracted_3 = tensor.extract %2[%c0, %35, %47, %9] : tensor<1x33x33x21xf32>
+ %extracted_4 = tensor.extract %2[%c0, %36, %46, %9] : tensor<1x33x33x21xf32>
+ %extracted_5 = tensor.extract %2[%c0, %36, %47, %9] : tensor<1x33x33x21xf32>
+ %48 = arith.subf %cst_2, %24 : f32
+ %49 = arith.mulf %extracted, %48 : f32
+ %50 = arith.mulf %extracted_3, %24 : f32
+ %51 = arith.addf %49, %50 : f32
+ %52 = arith.mulf %extracted_4, %48 : f32
+ %53 = arith.mulf %extracted_5, %24 : f32
+ %54 = arith.addf %52, %53 : f32
+ %55 = arith.subf %cst_2, %17 : f32
+ %56 = arith.mulf %51, %55 : f32
+ %57 = arith.mulf %54, %17 : f32
+ %58 = arith.addf %56, %57 : f32
+ linalg.yield %58 : f32
+ } -> tensor<1x257x257x21xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [1, 257, 257, 21], strides = [1, 1, 1, 1] : tensor<1x257x257x21xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x257x257x21xf32>>
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir
index 4fb89a1..12794e3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir
@@ -1,298 +1,163 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --split-input-file %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors_default {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @matmul_tensors_default layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensors_default() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %M = hal.interface.constant.load[0] : index
- %N = hal.interface.constant.load[1] : index
- %K = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N}
- %init_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N} -> tensor<?x?xf32>
- %init = flow.dispatch.tensor.load %init_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N} -> tensor<?x?xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @matmul_tensors_default() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+ %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, 16, 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_tensors
+// CHECK: func.func @matmul_tensors_default()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @i4_i4_i32_matmul {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @i4_i4_i32_matmul layout(#pipeline_layout)
- builtin.module {
- func.func @i4_i4_i32_matmul() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %M = hal.interface.constant.load[0] : index
- %N = hal.interface.constant.load[1] : index
- %K = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%K, %N}
- %init_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%M, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%M, %K} -> tensor<?x?xi4>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%K, %N} -> tensor<?x?xi4>
- %init = flow.dispatch.tensor.load %init_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%M, %N} -> tensor<?x?xi32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xi4>, tensor<?x?xi4>) outs(%init : tensor<?x?xi32>) -> tensor<?x?xi32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%M, %N}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @i4_i4_i32_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1}
+ %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%0, %2} -> tensor<?x?xi4>
+ %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi4>>{%2, %1} -> tensor<?x?xi4>
+ %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32>
+ %10 = linalg.matmul ins(%7, %8 : tensor<?x?xi4>, tensor<?x?xi4>) outs(%9 : tensor<?x?xi32>) -> tensor<?x?xi32>
+ flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%0, %1}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @i4_i4_i32_matmul
+// CHECK: func.func @i4_i4_i32_matmul()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @batch_matmul_tensors {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @batch_matmul_tensors layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_tensors() {
- %cst = arith.constant 0.000000e+00 : f32
- %B = hal.interface.constant.load[0] : index
- %M = hal.interface.constant.load[1] : index
- %N = hal.interface.constant.load[2] : index
- %K = hal.interface.constant.load[3] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32)
- : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%B, %M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32)
- : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%B, %K, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32)
- : !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%B, %M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0, 0], sizes = [%B, %M, %K], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%B, %M, %K} -> tensor<?x?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0, 0], sizes = [%B, %K, %N], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%B, %K, %N} -> tensor<?x?x?xf32>
- %init = tensor.empty(%B, %M, %N) : tensor<?x?x?xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
- %batch_gemm = linalg.batch_matmul
- ins(%lhs, %rhs : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%fill : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
- flow.dispatch.tensor.store %batch_gemm, %result_binding, offsets = [0, 0, 0], sizes = [%B, %M, %N], strides = [1, 1, 1]
- : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%B, %M, %N}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @batch_matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.constant.load[3] : index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%0, %1, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%0, %3, %2}
+ %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%0, %1, %2}
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [%0, %1, %3], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%0, %1, %3} -> tensor<?x?x?xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [%0, %3, %2], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?xf32>>{%0, %3, %2} -> tensor<?x?x?xf32>
+ %9 = tensor.empty(%0, %1, %2) : tensor<?x?x?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ %11 = linalg.batch_matmul ins(%7, %8 : tensor<?x?x?xf32>, tensor<?x?x?xf32>) outs(%10 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ flow.dispatch.tensor.store %11, %6, offsets = [0, 0, 0], sizes = [%0, %1, %2], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%0, %1, %2}
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 64, 0], [1, 64, 64, 0], [0, 0, 0, 0], [1, 8, 16, 0], [0, 0, 0, 1], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @batch_matmul_tensors
+// CHECK: func.func @batch_matmul_tensors()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_static {
- hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android30"
- }>) {
- hal.executable.export public @matmul_static layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_static() {
- %cst = arith.constant 0.0 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<196x240xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<240x40xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<196x40xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [196, 240], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<196x240xf32>> -> tensor<196x240xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [240, 40], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<240x40xf32>> -> tensor<240x40xf32>
- %init = tensor.empty() : tensor<196x40xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<196x40xf32>) -> tensor<196x40xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<196x240xf32>, tensor<240x40xf32>)
- outs(%fill : tensor<196x40xf32>) -> tensor<196x40xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [196, 40], strides = [1, 1]
- : tensor<196x40xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x40xf32>>
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+module {
+ func.func @matmul_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<196x240xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<240x40xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<196x40xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 240], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<196x240xf32>> -> tensor<196x240xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [240, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<240x40xf32>> -> tensor<240x40xf32>
+ %5 = tensor.empty() : tensor<196x40xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x40xf32>) -> tensor<196x40xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<196x240xf32>, tensor<240x40xf32>) outs(%6 : tensor<196x40xf32>) -> tensor<196x40xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 40], strides = [1, 1] : tensor<196x40xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x40xf32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[28, 20, 0], [28, 20, 0], [0, 0, 0], [8, 16, 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_static
+// CHECK: func.func @matmul_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_static {
- hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android30"
- }>) {
- hal.executable.export public @conv_static layout(#pipeline_layout)
- builtin.module {
- func.func @conv_static() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c607520 = arith.constant 607520 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x51x41x512xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c607520) : !flow.dispatch.tensor<readonly:tensor<3x3x512x512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x25x20x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 51, 41, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x51x41x512xf32>> -> tensor<1x51x41x512xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 512, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x512x512xf32>> -> tensor<3x3x512x512xf32>
- %5 = tensor.empty() : tensor<1x25x20x512xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32>
- %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x51x41x512xf32>, tensor<3x3x512x512xf32>) outs(%6 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 25, 20, 512], strides = [1, 1, 1, 1] : tensor<1x25x20x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x25x20x512xf32>>
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+module {
+ func.func @conv_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %c607520 = arith.constant 607520 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x51x41x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c607520) : !flow.dispatch.tensor<readonly:tensor<3x3x512x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x25x20x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 51, 41, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x51x41x512xf32>> -> tensor<1x51x41x512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 512, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x512x512xf32>> -> tensor<3x3x512x512xf32>
+ %5 = tensor.empty() : tensor<1x25x20x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x51x41x512xf32>, tensor<3x3x512x512xf32>) outs(%6 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 25, 20, 512], strides = [1, 1, 1, 1] : tensor<1x25x20x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x25x20x512xf32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 5, 20, 64, 0, 0, 0], [1, 1, 20, 64, 0, 0, 0], [0, 0, 0, 0, 1, 1, 16], [0, 0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @conv_static
+// CHECK: func.func @conv_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @restrict_num_workgroups {
- hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android30"
- }>) {
- hal.executable.export public @restrict_num_workgroups layout(#pipeline_layout)
- builtin.module {
- func.func @restrict_num_workgroups() {
- %cst = arith.constant 0.000000e+00 : f32
- %input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<1x11x11x576xf32>>
- %filter_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<5x5x576xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<1x7x7x576xf32>>
- %input = flow.dispatch.tensor.load %input_binding, offsets = [0, 0, 0, 0], sizes = [1, 11, 11, 576], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x11x11x576xf32>> -> tensor<1x11x11x576xf32>
- %filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<5x5x576xf32>> -> tensor<5x5x576xf32>
- %init = tensor.empty() : tensor<1x7x7x576xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
- %conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
- ins(%input, %filter : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>)
- outs(%fill : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
- flow.dispatch.tensor.store %conv, %result_binding, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 576], strides = [1, 1, 1, 1]
- : tensor<1x7x7x576xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x7x7x576xf32>>
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+module {
+ func.func @restrict_num_workgroups() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x11x11x576xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<5x5x576xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x7x7x576xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 11, 11, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x11x11x576xf32>> -> tensor<1x11x11x576xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<5x5x576xf32>> -> tensor<5x5x576xf32>
+ %5 = tensor.empty() : tensor<1x7x7x576xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>) outs(%6 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 576], strides = [1, 1, 1, 1] : tensor<1x7x7x576xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x7x7x576xf32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 7, 7, 64, 0, 0], [1, 1, 1, 4, 0, 0], [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @restrict_num_workgroups
+// CHECK: func.func @restrict_num_workgroups()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -300,176 +165,107 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_aarch_i8_i8_i32_static {
- hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android30"
- }>) {
- hal.executable.export public @matmul_aarch_i8_i8_i32_static layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_aarch_i8_i8_i32_static() {
- %c0_i32 = arith.constant 0 : i32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xi8>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xi8>> -> tensor<128x384xi8>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>> -> tensor<384x1536xi8>
- %5 = tensor.empty() : tensor<128x1536xi32>
- %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
- %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+module {
+ func.func @matmul_aarch_i8_i8_i32_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %c0_i32 = arith.constant 0 : i32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xi8>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xi8>> -> tensor<128x384xi8>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>> -> tensor<384x1536xi8>
+ %5 = tensor.empty() : tensor<128x1536xi32>
+ %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, 16, 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_aarch_i8_i8_i32_static
+// CHECK: func.func @matmul_aarch_i8_i8_i32_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_aarch_i8_i8_i32_dynamic {
- hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android30"
- }>) {
- hal.executable.export public @matmul_aarch_i8_i8_i32_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_aarch_i8_i8_i32_dynamic() {
- %c0 = arith.constant 0 : index
- %M = hal.interface.constant.load[0] : index
- %N = hal.interface.constant.load[1] : index
- %K = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%K, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0)
- : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%M, %K} -> tensor<?x?xi8>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%K, %N} -> tensor<?x?xi8>
- %init = flow.dispatch.tensor.load %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%M, %N} -> tensor<?x?xi32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xi8>, tensor<?x?xi8>) outs(%init : tensor<?x?xi32>) -> tensor<?x?xi32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%M, %N}
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+module {
+ func.func @matmul_aarch_i8_i8_i32_dynamic() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%0, %1}
+ %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%0, %2} -> tensor<?x?xi8>
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi8>>{%2, %1} -> tensor<?x?xi8>
+ %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%0, %1} -> tensor<?x?xi32>
+ %9 = linalg.matmul ins(%6, %7 : tensor<?x?xi8>, tensor<?x?xi8>) outs(%8 : tensor<?x?xi32>) -> tensor<?x?xi32>
+ flow.dispatch.tensor.store %9, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%0, %1}
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, 16, 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_aarch_i8_i8_i32_dynamic
+// CHECK: func.func @matmul_aarch_i8_i8_i32_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @pack {
- hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android30"
- }>) {
- hal.executable.export public @pack layout(#pipeline_layout)
- builtin.module {
- func.func @pack() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<20x40xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x48x8x1xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<20x40xf32>> -> tensor<20x40xf32>
- %3 = tensor.empty() : tensor<4x48x8x1xf32>
- %4 = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 : tensor<20x40xf32> -> tensor<4x48x8x1xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [4, 48, 8, 1], strides = [1, 1, 1, 1] : tensor<4x48x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x48x8x1xf32>>
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+module {
+ func.func @pack() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<20x40xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x48x8x1xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<20x40xf32>> -> tensor<20x40xf32>
+ %3 = tensor.empty() : tensor<4x48x8x1xf32>
+ %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 : tensor<20x40xf32> -> tensor<4x48x8x1xf32>
+ flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [4, 48, 8, 1], strides = [1, 1, 1, 1] : tensor<4x48x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x48x8x1xf32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 40], [1, 1]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
-// CHECK: hal.executable.export public @pack
+// CHECK: func.func @pack()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.pack
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @unpack_outer_dynamic {
- hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android30"
- }>) {
- hal.executable.export public @unpack_outer_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @unpack_outer_dynamic() {
- %c131072 = arith.constant 131072 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = arith.index_castui %0 : i32 to index
- %5 = arith.index_castui %1 : i32 to index
- %6 = arith.index_castui %2 : i32 to index
- %7 = arith.index_castui %3 : i32 to index
- %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
- %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
- %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5} -> tensor<?x?x32x16xi32>
- %11 = tensor.empty(%6, %7) : tensor<?x?xi32>
- %12 = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : tensor<?x?x32x16xi32> -> tensor<?x?xi32>
- flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
+module {
+ func.func @unpack_outer_dynamic() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %c131072 = arith.constant 131072 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_castui %0 : i32 to index
+ %5 = arith.index_castui %1 : i32 to index
+ %6 = arith.index_castui %2 : i32 to index
+ %7 = arith.index_castui %3 : i32 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+ %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5} -> tensor<?x?x32x16xi32>
+ %11 = tensor.empty(%6, %7) : tensor<?x?xi32>
+ %unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : tensor<?x?x32x16xi32> -> tensor<?x?xi32>
+ flow.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [32, 16]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
-// CHECK: hal.executable.export public @unpack_outer_dynamic
+// CHECK: func.func @unpack_outer_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.unpack
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -477,44 +273,22 @@
// -----
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
-#map3 = affine_map<(d0)[s0] -> (s0, -d0 + 96)>
-#map4 = affine_map<(d0)[s0] -> (s0, -d0 + 128)>
-hal.executable private @mmt4d_384x384x512_4x1x4_dispatch_0 {
- hal.executable.variant public @embedded_elf_arm_64 target(#executable_target_embedded_elf_arm_64_) {
- hal.executable.export public @mmt4d_384x384x512_4x1x4_dispatch_0 layout(#pipeline_layout)
- builtin.module {
- func.func @mmt4d_384x384x512_4x1x4_dispatch_0() {
- %c0 = arith.constant 0 : index
- %c96 = arith.constant 96 : index
- %c128 = arith.constant 128 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x384x4x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x384x4x1xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<96x128x4x4xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 1], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<96x384x4x1xf32>> -> tensor<96x384x4x1xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 384, 4, 1], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x384x4x1xf32>> -> tensor<128x384x4x1xf32>
- %11 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 4], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readwrite:tensor<96x128x4x4xf32>> -> tensor<96x128x4x4xf32>
- %12 = linalg.mmt4d
- ins(%8, %10 : tensor<96x384x4x1xf32>, tensor<128x384x4x1xf32>)
- outs(%11 : tensor<96x128x4x4xf32>) -> tensor<96x128x4x4xf32>
- flow.dispatch.tensor.store %12, %2, offsets = [0, 0, 0, 0], sizes = [96, 128, 4, 4], strides = [1, 1, 1, 1]
- : tensor<96x128x4x4xf32> -> !flow.dispatch.tensor<readwrite:tensor<96x128x4x4xf32>>
- return
- }
- }
+module {
+ func.func @mmt4d_384x384x512_4x1x4_dispatch_0() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %c96 = arith.constant 96 : index
+ %c128 = arith.constant 128 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x384x4x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x384x4x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<96x128x4x4xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<96x384x4x1xf32>> -> tensor<96x384x4x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384x4x1xf32>> -> tensor<128x384x4x1xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<96x128x4x4xf32>> -> tensor<96x128x4x4xf32>
+ %6 = linalg.mmt4d ins(%3, %4 : tensor<96x384x4x1xf32>, tensor<128x384x4x1xf32>) outs(%5 : tensor<96x128x4x4xf32>) -> tensor<96x128x4x4xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [96, 128, 4, 4], strides = [1, 1, 1, 1] : tensor<96x128x4x4xf32> -> !flow.dispatch.tensor<readwrite:tensor<96x128x4x4xf32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 16, 0, 0, 0, 0], [16, 16, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 0, 4, 4, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]{{\]}}
// CHECK: func.func @mmt4d_384x384x512_4x1x4_dispatch_0()
// CHECK: linalg.mmt4d
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir
index 8de9bbe..7750a3e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir
@@ -1,268 +1,167 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' \
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' \
// RUN: --iree-llvmcpu-enable-scalable-vectorization=true --split-input-file %s | FileCheck %s --check-prefixes=CHECK,WITH-SME
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' \
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' \
// RUN: --iree-llvmcpu-enable-scalable-vectorization=true --split-input-file --iree-experimental-llvmcpu-arm-force-ssve=true %s | FileCheck %s --check-prefixes=CHECK,SSVE-WITHOUT-SME
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors_sve {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @matmul_tensors layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensors() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %M = hal.interface.constant.load[0] : index
- %N = hal.interface.constant.load[1] : index
- %K = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N}
- %init_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N} -> tensor<?x?xf32>
- %init = flow.dispatch.tensor.load %init_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N} -> tensor<?x?xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+ %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @matmul_tensors
+// CHECK: func.func @matmul_tensors()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_static_tensors_sve {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @static_tensors_non_pow_two_sizes layout(#pipeline_layout)
- builtin.module {
- func.func @static_tensors_non_pow_two_sizes() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x7xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x7xf32>> -> tensor<14x7xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> -> tensor<15x7xf32>
- %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> return }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @static_tensors_non_pow_two_sizes() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x7xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x7xf32>> -> tensor<14x7xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> -> tensor<15x7xf32>
+ %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[5, 7, 0], [5, [8], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @static_tensors_non_pow_two_sizes
+// CHECK: func.func @static_tensors_non_pow_two_sizes()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @static_tensors_1x1 {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @static_tensors_1x1 layout(#pipeline_layout)
- builtin.module {
- func.func @static_tensors_1x1() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
- %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @static_tensors_1x1() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [1, 1, 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @static_tensors_1x1
+// CHECK: func.func @static_tensors_1x1()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors_sme {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve,+sme",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @matmul_tensors layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensors() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %M = hal.interface.constant.load[0] : index
- %N = hal.interface.constant.load[1] : index
- %K = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N}
- %init_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N} -> tensor<?x?xf32>
- %init = flow.dispatch.tensor.load %init_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N} -> tensor<?x?xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+ %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ return
}
}
// SSVE-WITHOUT-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// SSVE-WITHOUT-SME-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// SSVE-WITHOUT-SME: hal.executable.export public @matmul_tensors
+// SSVE-WITHOUT-SME: func.func @matmul_tensors()
// SSVE-WITHOUT-SME-SAME: translation_info = #[[TRANSLATION]]
// SSVE-WITHOUT-SME: linalg.matmul
// SSVE-WITHOUT-SME-SAME: lowering_config = #[[CONFIG]]
// WITH-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], {{\[}}[4], [4], 0], [0, 0, 1], [0, 0, 0]]>
// WITH-SME-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// WITH-SME: hal.executable.export public @matmul_tensors
+// WITH-SME: func.func @matmul_tensors()
// WITH-SME-SAME: translation_info = #[[TRANSLATION]]
// WITH-SME: linalg.matmul
// WITH-SME-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 5, sets = [
- <0, bindings = [
- <0, storage_buffer, ReadOnly>,
- <1, storage_buffer, ReadOnly>,
- <2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_with_fill {
- hal.executable.variant @llvm target(<"llvm-cpu", "system-elf-arm_64", {
- cpu = "",
- cpu_features = "+v9a,+sve",
- data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128",
- link_embedded = false,
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-linux-android34"
- }>) {
- hal.executable.export @matmul_with_fill layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_with_fill() {
- %c0_i32 = arith.constant 0 : i32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = arith.index_castui %0 : i32 to index
- %6 = arith.index_castui %1 : i32 to index
- %7 = arith.index_castui %2 : i32 to index
- %8 = arith.index_castui %3 : i32 to index
- %9 = arith.index_castui %4 : i32 to index
- %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>>
- %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
- %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
- %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
- %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
- %15 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>> -> tensor<1024x256xi8>
- %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
- %17 = flow.dispatch.tensor.load %12, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
- %18 = flow.dispatch.tensor.load %13, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
- %19 = tensor.empty() : tensor<1024x256xf32>
- %20 = tensor.empty() : tensor<1024x256xi32>
- %21 = linalg.fill ins(%c0_i32 : i32) outs(%20 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
- %22 = linalg.matmul ins(%15, %16 : tensor<1024x256xi8>, tensor<256x256xi8>) outs(%21 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
- %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%22, %17, %18 : tensor<1024x256xi32>, tensor<1024xf32>, tensor<256xf32>) outs(%19 : tensor<1024x256xf32>) {
- ^bb0(%in: i32, %in_0: f32, %in_1: f32, %out: f32):
- %24 = arith.sitofp %in : i32 to f32
- %25 = arith.mulf %24, %in_0 : f32
- %26 = arith.mulf %25, %in_1 : f32
- linalg.yield %26 : f32
- } -> tensor<1024x256xf32>
- flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
- return
- }
- }
+#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {cpu = "", cpu_features = "+v9a,+sve", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", link_embedded = false, native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android34"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+#map2 = affine_map<(d0, d1) -> (d1)>
+module {
+ func.func @matmul_with_fill() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+ %c0_i32 = arith.constant 0 : i32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = arith.index_castui %0 : i32 to index
+ %6 = arith.index_castui %1 : i32 to index
+ %7 = arith.index_castui %2 : i32 to index
+ %8 = arith.index_castui %3 : i32 to index
+ %9 = arith.index_castui %4 : i32 to index
+ %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>>
+ %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
+ %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
+ %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
+ %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
+ %15 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>> -> tensor<1024x256xi8>
+ %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
+ %17 = flow.dispatch.tensor.load %12, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
+ %18 = flow.dispatch.tensor.load %13, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
+ %19 = tensor.empty() : tensor<1024x256xf32>
+ %20 = tensor.empty() : tensor<1024x256xi32>
+ %21 = linalg.fill ins(%c0_i32 : i32) outs(%20 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
+ %22 = linalg.matmul ins(%15, %16 : tensor<1024x256xi8>, tensor<256x256xi8>) outs(%21 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
+ %23 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map], iterator_types = ["parallel", "parallel"]} ins(%22, %17, %18 : tensor<1024x256xi32>, tensor<1024xf32>, tensor<256xf32>) outs(%19 : tensor<1024x256xf32>) {
+ ^bb0(%in: i32, %in_0: f32, %in_1: f32, %out: f32):
+ %24 = arith.sitofp %in : i32 to f32
+ %25 = arith.mulf %24, %in_0 : f32
+ %26 = arith.mulf %25, %in_1 : f32
+ linalg.yield %26 : f32
+ } -> tensor<1024x256xf32>
+ flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [8, [16]], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @matmul_with_fill
+// CHECK: func.func @matmul_with_fill()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG1]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir
index b8c5156..2fb8d62 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir
@@ -1,139 +1,80 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' \
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' \
// RUN: --iree-llvmcpu-enable-scalable-vectorization=true --iree-llvmcpu-vector-pproc-strategy=peel \
// RUN: --split-input-file %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors_sve {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @matmul_tensors layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensors() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %M = hal.interface.constant.load[0] : index
- %N = hal.interface.constant.load[1] : index
- %K = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N}
- %init_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N}
- %result_binding = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%K, %N} -> tensor<?x?xf32>
- %init = flow.dispatch.tensor.load %init_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %N} -> tensor<?x?xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%M, %N}
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+ %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_tensors
+// CHECK: func.func @matmul_tensors()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_static_tensors_sve {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @static_tensors_non_pow_two_sizes layout(#pipeline_layout)
- builtin.module {
- func.func @static_tensors_non_pow_two_sizes() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x7xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x7xf32>> -> tensor<14x7xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> -> tensor<15x7xf32>
- %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> return }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @static_tensors_non_pow_two_sizes() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x7xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x7xf32>> -> tensor<14x7xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> -> tensor<15x7xf32>
+ %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[5, 7, 0], [5, 7, 0], [0, 0, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @static_tensors_non_pow_two_sizes
+// CHECK: func.func @static_tensors_non_pow_two_sizes()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @static_tensors_1x1 {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- cpu_features = "+sve",
- native_vector_size = 16 : index,
- target_triple = "aarch64-none-elf"
- }>) {
- hal.executable.export @static_tensors_1x1 layout(#pipeline_layout)
- builtin.module {
- func.func @static_tensors_1x1() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
- %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+module {
+ func.func @static_tensors_1x1() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ return
}
}
// TODO: FIXME - scalable "16" ([16]) for just 1 element
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [0, 0, 0], [0, 0, 0], [1, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @static_tensors_1x1
+// CHECK: func.func @static_tensors_1x1()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir
index cd65e29..69792af 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir
@@ -1,47 +1,27 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' --iree-llvmcpu-disable-distribution --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --iree-llvmcpu-disable-distribution --split-input-file %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_static {
- hal.executable.variant public @embedded_elf_x86_64 target(#hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-none-elf"
- }>) {
- hal.executable.export public @matmul_static layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_static() {
- %cst = arith.constant 0.0 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [384, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
- %init = tensor.empty() : tensor<384x128xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
- outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
- : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @matmul_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+ %5 = tensor.empty() : tensor<384x128xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0], [8, 32], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [0, 0, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_static
+// CHECK: func.func @matmul_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir
index fca8f1a..9157ed2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir
@@ -1,100 +1,51 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --split-input-file %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_riscv {
- hal.executable.variant public @embedded_elf_x86_64 target(#hal.executable.target<
- "llvm-cpu",
- "embedded-elf-riscv_32", {
- cpu_features = "+m,+f",
- data_layout = "e-m:e-p:32:32-i64:64-n32-S128",
- native_vector_size = 16 : index,
- target_triple = "riscv32-none-elf"
- }>) {
- hal.executable.export public @matmul_riscv layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_riscv() {
- %cst = arith.constant 0.0 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [384, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
- %init = tensor.empty() : tensor<384x128xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
- outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
- : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_riscv_32_ = #hal.executable.target<"llvm-cpu", "embedded-elf-riscv_32", {cpu_features = "+m,+f", data_layout = "e-m:e-p:32:32-i64:64-n32-S128", native_vector_size = 16 : index, target_triple = "riscv32-none-elf"}>
+module {
+ func.func @matmul_riscv() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+ %5 = tensor.empty() : tensor<384x128xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [8, 32], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_riscv
+// CHECK: func.func @matmul_riscv()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG2]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @thin_depthwise_conv_static {
- hal.executable.variant public @embedded_elf_x86_64 target(#hal.executable.target<
- "llvm-cpu",
- "embedded-elf-riscv_32", {
- cpu_features = "+m,+f",
- data_layout = "e-m:e-p:32:32-i64:64-n32-S128",
- native_vector_size = 16 : index,
- target_triple = "riscv32-none-elf"
- }>) {
- hal.executable.export public @thin_depthwise_conv_static layout(#pipeline_layout)
- builtin.module {
- func.func @thin_depthwise_conv_static() {
- %cst = arith.constant 0.0 : f32
- %input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
- %filter_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
- %input = flow.dispatch.tensor.load %input_binding, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
- %filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
- %init = tensor.empty() : tensor<1x28x28x72xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
- %conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%input, %filter : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>)
- outs(%fill : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-
- flow.dispatch.tensor.store %conv, %result_binding, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1]
- : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
- return
- }
- }
+#executable_target_embedded_elf_riscv_32_ = #hal.executable.target<"llvm-cpu", "embedded-elf-riscv_32", {cpu_features = "+m,+f", data_layout = "e-m:e-p:32:32-i64:64-n32-S128", native_vector_size = 16 : index, target_triple = "riscv32-none-elf"}>
+module {
+ func.func @thin_depthwise_conv_static() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
+ %5 = tensor.empty() : tensor<1x28x28x72xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 7, 7, 72, 0, 0], [1, 1, 7, 4, 0, 0], [0, 0, 0, 0, 1, 3], [0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @thin_depthwise_conv_static
+// CHECK: func.func @thin_depthwise_conv_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 7af1e58..4708080 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1,326 +1,212 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-select-lowering-strategy)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --split-input-file %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matvec_static {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @matvec_static layout(#pipeline_layout)
- builtin.module {
- func.func @matvec_static() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
- %5 = tensor.empty() : tensor<128xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128xf32>) -> tensor<128xf32>
- %7 = linalg.matvec ins(%3, %4 : tensor<128x384xf32>, tensor<384xf32>) outs(%6 : tensor<128xf32>) -> tensor<128xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @matvec_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+ %5 = tensor.empty() : tensor<128xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128xf32>) -> tensor<128xf32>
+ %7 = linalg.matvec ins(%3, %4 : tensor<128x384xf32>, tensor<384xf32>) outs(%6 : tensor<128xf32>) -> tensor<128xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [64, 0], [0, 0], [32, 0], [0, 16], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matvec_static
+// CHECK: func.func @matvec_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matvec
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matvec_dynamic {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @matvec_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @matvec_dynamic() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = arith.index_cast %0 : i32 to index
- %4 = arith.index_cast %1 : i32 to index
- %5 = arith.index_cast %2 : i32 to index
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4}
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5}
- %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%3}
- %9 = hal.interface.constant.load[0] : i32
- %10 = arith.index_cast %9 : i32 to index
- %11 = flow.dispatch.tensor.load %8, offsets = [0], sizes = [%10], strides = [1] : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%3} -> tensor<?xf32>
- %12 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4} -> tensor<?x?xf32>
- %13 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5} -> tensor<?xf32>
- %14 = linalg.fill ins(%cst : f32) outs(%11 : tensor<?xf32>) -> tensor<?xf32>
- %15 = linalg.matvec ins(%12, %13 : tensor<?x?xf32>, tensor<?xf32>) outs(%14 : tensor<?xf32>) -> tensor<?xf32>
- flow.dispatch.tensor.store %15, %8, offsets = [0], sizes = [%3], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%3}
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @matvec_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = arith.index_cast %0 : i32 to index
+ %4 = arith.index_cast %1 : i32 to index
+ %5 = arith.index_cast %2 : i32 to index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4}
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5}
+ %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%3}
+ %9 = hal.interface.constant.load[0] : i32
+ %10 = arith.index_cast %9 : i32 to index
+ %11 = flow.dispatch.tensor.load %8, offsets = [0], sizes = [%10], strides = [1] : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%3} -> tensor<?xf32>
+ %12 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4} -> tensor<?x?xf32>
+ %13 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5} -> tensor<?xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%11 : tensor<?xf32>) -> tensor<?xf32>
+ %15 = linalg.matvec ins(%12, %13 : tensor<?x?xf32>, tensor<?xf32>) outs(%14 : tensor<?xf32>) -> tensor<?xf32>
+ flow.dispatch.tensor.store %15, %8, offsets = [0], sizes = [%3], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%3}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [64, 0], [0, 0], [32, 0], [0, 16], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matvec_dynamic
+// CHECK: func.func @matvec_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matvec
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @dot_static {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @dot_static layout(#pipeline_layout)
- builtin.module {
- func.func @dot_static() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
- %5 = tensor.empty() : tensor<f32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
- %7 = linalg.dot ins(%3, %4 : tensor<384xf32>, tensor<384xf32>) outs(%6 : tensor<f32>) -> tensor<f32>
- flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @dot_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+ %5 = tensor.empty() : tensor<f32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
+ %7 = linalg.dot ins(%3, %4 : tensor<384xf32>, tensor<384xf32>) outs(%6 : tensor<f32>) -> tensor<f32>
+ flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [0], [0], [16], [0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @dot_static
+// CHECK: func.func @dot_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.dot
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @dot_dynamic {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @dot_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @dot_dynamic() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.index_cast %0 : i32 to index
- %3 = arith.index_cast %1 : i32 to index
- %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
- %5 = flow.dispatch.tensor.load %4, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<writeonly:tensor<f32>> -> tensor<f32>
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2}
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3}
- %8 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2} -> tensor<?xf32>
- %9 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3} -> tensor<?xf32>
- %10 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
- %11 = linalg.dot ins(%8, %9 : tensor<?xf32>, tensor<?xf32>) outs(%10 : tensor<f32>) -> tensor<f32>
- flow.dispatch.tensor.store %11, %4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @dot_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.index_cast %0 : i32 to index
+ %3 = arith.index_cast %1 : i32 to index
+ %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+ %5 = flow.dispatch.tensor.load %4, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<writeonly:tensor<f32>> -> tensor<f32>
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2}
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3}
+ %8 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2} -> tensor<?xf32>
+ %9 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3} -> tensor<?xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
+ %11 = linalg.dot ins(%8, %9 : tensor<?xf32>, tensor<?xf32>) outs(%10 : tensor<f32>) -> tensor<f32>
+ flow.dispatch.tensor.store %11, %4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [0], [0], [16], [0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @dot_dynamic
+// CHECK: func.func @dot_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.dot
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @dynamic_add {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @dynamic_add layout(#pipeline_layout)
- builtin.module {
- func.func @dynamic_add() {
- %c0 = arith.constant 0 : index
- %dim0 = hal.interface.constant.load[0] : index
- %dim1 = hal.interface.constant.load[1] : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim0, %dim1}
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%dim1}
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%dim0, %dim1}
- %3 = flow.dispatch.tensor.load %0, offsets=[0, 0], sizes=[%dim0, %dim1], strides=[1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%dim0, %dim1} -> tensor<?x?xf32>
- %4 = flow.dispatch.tensor.load %1, offsets=[0], sizes=[%dim1], strides=[1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%dim1} -> tensor<?xf32>
- %5 = tensor.empty(%dim0, %dim1) : tensor<?x?xf32>
- %6 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d1)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%3, %4 : tensor<?x?xf32>, tensor<?xf32>) outs(%5 : tensor<?x?xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
- %7 = arith.addf %arg0, %arg1 : f32
- linalg.yield %7 : f32
- } -> tensor<?x?xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%dim0, %dim1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%dim0, %dim1}
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d1)>
+module {
+ func.func @dynamic_add() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1}
+ %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %6 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [%1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1} -> tensor<?xf32>
+ %7 = tensor.empty(%0, %1) : tensor<?x?xf32>
+ %8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %6 : tensor<?x?xf32>, tensor<?xf32>) outs(%7 : tensor<?x?xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %9 = arith.addf %in, %in_0 : f32
+ linalg.yield %9 : f32
+ } -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @dynamic_add
+// CHECK: func.func @dynamic_add()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @add4D {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @add4D layout(#pipeline_layout)
- builtin.module {
- func.func @add4D() {
- %d0 = hal.interface.constant.load[0] : index
- %d1 = hal.interface.constant.load[1] : index
- %d2 = hal.interface.constant.load[2] : index
- %d3 = hal.interface.constant.load[3] : index
- %arg1_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32)
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%d0, %d1, %d2, %d3}
- %arg2_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32)
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%d0, %d1, %d2, %d3}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32)
- : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%d0, %d1, %d2, %d3}
- %arg1 = flow.dispatch.tensor.load %arg1_binding, offsets = [0, 0, 0, 0], sizes = [%d0, %d1, %d2, %d3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%d0, %d1, %d2, %d3} -> tensor<?x?x?x?xf32>
- %arg2 = flow.dispatch.tensor.load %arg2_binding, offsets = [0, 0, 0, 0], sizes = [%d0, %d1, %d2, %d3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%d0, %d1, %d2, %d3} -> tensor<?x?x?x?xf32>
- %init = tensor.empty(%d0, %d1, %d2, %d3) : tensor<?x?x?x?xf32>
- %add = linalg.generic {
- indexing_maps = [#map, #map, #map],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
- ins(%arg1, %arg2 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%init : tensor<?x?x?x?xf32>) {
- ^bb0(%b0: f32, %b1: f32, %b2: f32): // no predecessors
- %addf = arith.addf %b0, %b1 : f32
- linalg.yield %addf : f32
- } -> tensor<?x?x?x?xf32>
- flow.dispatch.tensor.store %add, %result_binding, offsets = [0, 0, 0, 0], sizes = [%d0, %d1, %d2, %d3], strides = [1, 1, 1, 1]
- : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%d0, %d1, %d2, %d3}
- return
- }
- }
+module {
+ func.func @add4D() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.constant.load[3] : index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+ %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
+ %8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
+ %9 = tensor.empty(%0, %1, %2, %3) : tensor<?x?x?x?xf32>
+ %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %8 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%9 : tensor<?x?x?x?xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %11 = arith.addf %in, %in_0 : f32
+ linalg.yield %11 : f32
+ } -> tensor<?x?x?x?xf32>
+ flow.dispatch.tensor.store %10, %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 64, 64], [1, 1, 1, 4], [0, 0, 0, 0], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @add4D
+// CHECK: func.func @add4D()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @add_static {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-unknown-linux-gnu"
- }>) {
- hal.executable.export @add_static layout(#pipeline_layout)
- builtin.module {
- func.func @add_static() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x16x32x128xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>> -> tensor<64x16x32x128xf32>
- %3 = tensor.empty() : tensor<64x16x32x128xf32>
- %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<64x16x32x128xf32>) outs(%3 : tensor<64x16x32x128xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %5 = arith.addf %arg0, %arg0 : f32
- linalg.yield %5 : f32
- } -> tensor<64x16x32x128xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : tensor<64x16x32x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x16x32x128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @add_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x16x32x128xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>> -> tensor<64x16x32x128xf32>
+ %3 = tensor.empty() : tensor<64x16x32x128xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<64x16x32x128xf32>) outs(%3 : tensor<64x16x32x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %5 = arith.addf %in, %in : f32
+ linalg.yield %5 : f32
+ } -> tensor<64x16x32x128xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : tensor<64x16x32x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x16x32x128xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 64], [1, 1, 1, 4], [0, 0, 0, 0], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @add_static
+// CHECK: func.func @add_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -329,297 +215,179 @@
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0], [64, 64, 0], [0, 0, 0], [32, 32, 0], [0, 0, 32], [0, 0, 0]]>
#translation = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @preset_config_matmul_tensors {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @preset_config_matmul_tensors layout(#pipeline_layout) attributes {translation_info = #translation}
- builtin.module {
- func.func @preset_config_matmul_tensors() {
- %cst = arith.constant 0.000000e+00 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<256x512xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [128, 256], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [256, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x512xf32>> -> tensor<256x512xf32>
- %init = tensor.empty() : tensor<128x512xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<128x512xf32>) -> tensor<128x512xf32>
- %gemm = linalg.matmul {lowering_config = #config}
- ins(%lhs, %rhs : tensor<128x256xf32>, tensor<256x512xf32>)
- outs(%fill : tensor<128x512xf32>) -> tensor<128x512xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [128, 512], strides = [1, 1]
- : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @preset_config_matmul_tensors() attributes {
+ hal.executable.target = #executable_target_system_elf_x86_64_,
+ translation_info = #translation
+ } {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x512xf32>> -> tensor<256x512xf32>
+ %5 = tensor.empty() : tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x256xf32>, tensor<256x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [32, 32, 0], [0, 0, 32], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @preset_config_matmul_tensors
+// CHECK: func.func @preset_config_matmul_tensors()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: func.func @preset_config
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_partially_peel {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @matmul_partially_peel layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_partially_peel() {
- %cst = arith.constant 0.000000e+00 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<16x8xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [16641, 16], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>> -> tensor<16641x16xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [16, 8], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<16x8xf32>> -> tensor<16x8xf32>
- %init = tensor.empty() : tensor<16641x8xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<16641x8xf32>) -> tensor<16641x8xf32>
- %gemm = linalg.matmul
- ins(%lhs, %rhs : tensor<16641x16xf32>, tensor<16x8xf32>)
- outs(%fill : tensor<16641x8xf32>) -> tensor<16641x8xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [16641, 8], strides = [1, 1]
- : tensor<16641x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @matmul_partially_peel() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16x8xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16641, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>> -> tensor<16641x16xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x8xf32>> -> tensor<16x8xf32>
+ %5 = tensor.empty() : tensor<16641x8xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16641x8xf32>) -> tensor<16641x8xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<16641x16xf32>, tensor<16x8xf32>) outs(%6 : tensor<16641x8xf32>) -> tensor<16641x8xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [16641, 8], strides = [1, 1] : tensor<16641x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[43, 8, 0], [43, 8, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_partially_peel
+// CHECK: func.func @matmul_partially_peel()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: func.func @matmul_partially_peel
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @copy_op_dynamic {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @copy_op_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @copy_op_dynamic() {
- %d0 = hal.interface.constant.load[0] : index
- %d1 = hal.interface.constant.load[1] : index
- %d2 = hal.interface.constant.load[2] : index
- %d3 = hal.interface.constant.load[3] : index
- %o0 = hal.interface.constant.load[4] : index
- %o1 = hal.interface.constant.load[5] : index
- %source = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%d0, %d1}
- %dest = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%d2, %d3}
- %dest_view = memref.subview %dest[%o0, %o1] [%d0, %d1] [1, 1] : memref<?x?xi32> to memref<?x?xi32, strided<[?, 1], offset : ?>>
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)> , affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%source : memref<?x?xi32>) outs(%dest_view : memref<?x?xi32, strided<[?, 1], offset : ?>>) {
- ^bb0(%arg0 : i32, %arg1 : i32):
- linalg.yield %arg0 : i32
- }
- return
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @copy_op_dynamic() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.constant.load[3] : index
+ %4 = hal.interface.constant.load[4] : index
+ %5 = hal.interface.constant.load[5] : index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%2, %3}
+ %subview = memref.subview %7[%4, %5] [%0, %1] [1, 1] : memref<?x?xi32> to memref<?x?xi32, strided<[?, 1], offset: ?>>
+ linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : memref<?x?xi32>) outs(%subview : memref<?x?xi32, strided<[?, 1], offset: ?>>) {
+ ^bb0(%in: i32, %out: i32):
+ linalg.yield %in : i32
}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize>
-// CHECK: hal.executable.export public @copy_op_dynamic
+// CHECK: func.func @copy_op_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_1d_fft_stage2 {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @static_1d_fft_stage2 layout(#pipeline_layout)
- builtin.module {
- func.func @static_1d_fft_stage2() {
- %c0 = arith.constant 0 : index
- %c2 = arith.constant 2 : index
- %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
- %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
- flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @static_1d_fft_stage2() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
+ %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
+ flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
-// CHECK: hal.executable.export public @static_1d_fft_stage2
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: func.func @static_1d_fft_stage2()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.fft
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_3d_fft_stage3 {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @static_3d_fft_stage3 layout(#pipeline_layout)
- builtin.module {
- func.func @static_3d_fft_stage3() {
- %c3 = arith.constant 3 : index
- %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
- %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
- %0 = bufferization.to_memref %cst_0 : memref<4xf32>
- %1 = bufferization.to_memref %cst : memref<4xf32>
- %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
- %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
- iree_linalg_ext.fft
- ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>)
- outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @static_3d_fft_stage3() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %c3 = arith.constant 3 : index
+ %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
+ %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
+ %0 = bufferization.to_memref %cst_0 : memref<4xf32>
+ %1 = bufferization.to_memref %cst : memref<4xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
+ iree_linalg_ext.fft ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 64, 64]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
-// CHECK: hal.executable.export public @static_3d_fft_stage3
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: func.func @static_3d_fft_stage3()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.fft
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @outs_fusion {
- hal.executable.variant @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @outs_fusion_fn layout(#pipeline_layout)
- builtin.module {
- func.func @outs_fusion_fn() {
- %cst = arith.constant 0.0 : f32
- %d0 = hal.interface.constant.load[0] : index
- %d1 = hal.interface.constant.load[1] : index
- %d2 = hal.interface.constant.load[2] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d0, %d2}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d2, %d1}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%d0, %d1}
- %init = tensor.empty(%d0, %d1) : tensor<?x?xf32>
- %fill = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]}
- outs(%init : tensor<?x?xf32>) {
- ^bb0(%b0: f32):
- linalg.yield %cst : f32
- } -> tensor<?x?xf32>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%d0, %d2], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d0, %d2} -> tensor<?x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%d2, %d1], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d2, %d1} -> tensor<?x?xf32>
- %gemm = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d2, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%fill : tensor<?x?xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %6 = arith.mulf %arg0, %arg1 : f32
- %7 = arith.addf %6, %arg2 : f32
- linalg.yield %6 : f32
- } -> tensor<?x?xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1]
- : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%d0, %d1}
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @outs_fusion_fn() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+ %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ %6 = tensor.empty(%0, %1) : tensor<?x?xf32>
+ %7 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%6 : tensor<?x?xf32>) {
+ ^bb0(%out: f32):
+ linalg.yield %cst : f32
+ } -> tensor<?x?xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+ %9 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+ %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %9 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%7 : tensor<?x?xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %11 = arith.mulf %in, %in_0 : f32
+ %12 = arith.addf %11, %out : f32
+ linalg.yield %11 : f32
+ } -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+ return
}
}
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32], [1, 4], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 4], [0, 0, 0]]>
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @outs_fusion_fn
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: func.func @outs_fusion_fn()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG1]]
// CHECK: linalg.generic
@@ -627,836 +395,499 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_dynamic {
- hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export public @conv_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @conv_dynamic() {
- %N = hal.interface.constant.load[0] : index
- %H = hal.interface.constant.load[1] : index
- %W = hal.interface.constant.load[2] : index
- %C = hal.interface.constant.load[3] : index
- %R = hal.interface.constant.load[4] : index
- %S = hal.interface.constant.load[5] : index
- %F = hal.interface.constant.load[6] : index
- %P = hal.interface.constant.load[7] : index
- %Q = hal.interface.constant.load[8] : index
- %input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%N, %H, %W, %C}
- %filter_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%R, %S, %C, %F}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%N, %P, %Q, %F}
- %input = flow.dispatch.tensor.load %input_binding, offsets = [0, 0, 0, 0], sizes = [%N, %H, %W, %C], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%N, %H, %W, %C} -> tensor<?x?x?x?xf32>
- %filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0, 0], sizes = [%R, %S, %C, %F], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%R, %S, %C, %F} -> tensor<?x?x?x?xf32>
- %init = flow.dispatch.tensor.load %result_binding, offsets = [0, 0, 0, 0], sizes = [%N, %P, %Q, %F], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%N, %P, %Q, %F} -> tensor<?x?x?x?xf32>
- %conv = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
- ins(%input, %filter : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
- outs(%init : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
- flow.dispatch.tensor.store %conv, %result_binding, offsets = [0, 0, 0, 0], sizes = [%N, %P, %Q, %F], strides = [1, 1, 1, 1]
- : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%N, %P, %Q, %F}
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @conv_dynamic() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.constant.load[3] : index
+ %4 = hal.interface.constant.load[4] : index
+ %5 = hal.interface.constant.load[5] : index
+ %6 = hal.interface.constant.load[6] : index
+ %7 = hal.interface.constant.load[7] : index
+ %8 = hal.interface.constant.load[8] : index
+ %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+ %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %5, %3, %6}
+ %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6}
+ %12 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
+ %13 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [%4, %5, %3, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %5, %3, %6} -> tensor<?x?x?x?xf32>
+ %14 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [%0, %7, %8, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6} -> tensor<?x?x?x?xf32>
+ %15 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%12, %13 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%14 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+ flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0, 0], sizes = [%0, %7, %8, %6], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 64, 64, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @conv_dynamic
+// CHECK: func.func @conv_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
// CHECK: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_static {
- hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export public @conv_static layout(#pipeline_layout)
- builtin.module {
- func.func @conv_static() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c607520 = arith.constant 607520 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c607520) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
- %5 = tensor.empty() : tensor<1x112x112x16xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
- %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %c607520 = arith.constant 607520 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c607520) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
+ %5 = tensor.empty() : tensor<1x112x112x16xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 28, 28, 16, 0, 0, 0], [1, 1, 4, 4, 0, 0, 0], [0, 0, 0, 0, 1, 1, 3], [0, 0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @conv_static
+// CHECK: func.func @conv_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
// -----
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_nchw_static {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @conv_nchw_static ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @conv_nchw_static() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 128, 30, 30], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>> -> tensor<1x128x30x30xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>> -> tensor<128x128x3x3xf32>
- %5 = tensor.empty() : tensor<1x128x28x28xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
- %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%6 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 128, 28, 28], strides = [1, 1, 1, 1] : tensor<1x128x28x28xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
- return
- }
- }
+module {
+ func.func @conv_nchw_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 128, 30, 30], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>> -> tensor<1x128x30x30xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>> -> tensor<128x128x3x3xf32>
+ %5 = tensor.empty() : tensor<1x128x28x28xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
+ %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%6 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 128, 28, 28], strides = [1, 1, 1, 1] : tensor<1x128x28x28xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 28, 4, 0, 0, 0], [1, 4, 1, 4, 0, 0, 0], [0, 0, 0, 0, 8, 1, 1], [0, 0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @conv_nchw_static
+// CHECK: func.func @conv_nchw_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nchw_fchw
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @depthwise_conv_static {
- hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 64 : index
- }>) {
- hal.executable.export public @depthwise_conv_static layout(#pipeline_layout)
- builtin.module {
- func.func @depthwise_conv_static() {
- %cst = arith.constant 0.0 : f32
- %input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>>
- %filter_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
- %input = flow.dispatch.tensor.load %input_binding, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>> -> tensor<1x161x161x240xf32>
- %filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>> -> tensor<3x3x240xf32>
- %init = tensor.empty() : tensor<1x80x80x240xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
- %conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%input, %filter : tensor<1x161x161x240xf32>, tensor<3x3x240xf32>) outs(%fill : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
- flow.dispatch.tensor.store %conv, %result_binding, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 240], strides = [1, 1, 1, 1]
- : tensor<1x80x80x240xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @depthwise_conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>> -> tensor<1x161x161x240xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>> -> tensor<3x3x240xf32>
+ %5 = tensor.empty() : tensor<1x80x80x240xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x161x161x240xf32>, tensor<3x3x240xf32>) outs(%6 : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 240], strides = [1, 1, 1, 1] : tensor<1x80x80x240xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 40, 40, 48, 0, 0], [1, 1, 8, 16, 0, 0], [0, 0, 0, 0, 1, 3], [0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @depthwise_conv_static
+// CHECK: func.func @depthwise_conv_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @thin_depthwise_conv_static {
- hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 64 : index
- }>) {
- hal.executable.export public @thin_depthwise_conv_static layout(#pipeline_layout)
- builtin.module {
- func.func @thin_depthwise_conv_static() {
- %cst = arith.constant 0.0 : f32
- %input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
- %filter_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
-        %input = flow.dispatch.tensor.load %input_binding, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 72], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
-        %filter = flow.dispatch.tensor.load %filter_binding, offsets = [0, 0, 0], sizes = [3, 3, 72], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
- %init = tensor.empty() : tensor<1x28x28x72xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
- %conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%input, %filter : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>)
- outs(%fill : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-
- flow.dispatch.tensor.store %conv, %result_binding, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1]
- : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+module {
+ func.func @thin_depthwise_conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 72], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
+    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 72], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
+ %5 = tensor.empty() : tensor<1x28x28x72xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 7, 14, 36, 0, 0], [1, 1, 7, 12, 0, 0], [0, 0, 0, 0, 1, 3], [0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @thin_depthwise_conv_static
+// CHECK: func.func @thin_depthwise_conv_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-hal.executable private @pooling_nchw_max {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+crc32,+f16c,+fsgsbase,+fxsr,+invpcid,+lzcnt,+movbe,+pku,+prfchw,+rdrnd,+rdseed,+sahf,+x87,+xsave,+xsavec,+xsaveopt,+xsaves", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf", ukernels = false}>) {
- hal.executable.export public @pooling_nchw_max ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @pooling_nchw_max() {
- %c3846080 = arith.constant 3846080 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant -3.40282347E+38 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c3846080) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 64, 114, 114], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>> -> tensor<1x64x114x114xf32>
- %3 = tensor.empty() : tensor<1x64x56x56xf32>
- %4 = tensor.empty() : tensor<3x3xf32>
- %5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
- %6 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%2, %4 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%5 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
- flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 64, 56, 56], strides = [1, 1, 1, 1] : tensor<1x64x56x56xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+crc32,+f16c,+fsgsbase,+fxsr,+invpcid,+lzcnt,+movbe,+pku,+prfchw,+rdrnd,+rdseed,+sahf,+x87,+xsave,+xsavec,+xsaveopt,+xsaves", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf", ukernels = false}>
+module {
+ func.func @pooling_nchw_max() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c3846080 = arith.constant 3846080 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant -3.40282347E+38 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c3846080) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 64, 114, 114], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>> -> tensor<1x64x114x114xf32>
+ %3 = tensor.empty() : tensor<1x64x56x56xf32>
+ %4 = tensor.empty() : tensor<3x3xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
+ %6 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%2, %4 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%5 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
+ flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 64, 56, 56], strides = [1, 1, 1, 1] : tensor<1x64x56x56xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 32, 56, 8, 0, 0], [1, 8, 1, 8, 0, 0], [0, 0, 0, 0, 1, 3], [0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
-// CHECK: hal.executable.export public @pooling_nchw_max
+// CHECK: func.func @pooling_nchw_max()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.pooling_nchw_max
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @generic_static {
- hal.executable.variant public @system_elf_x86_64 target(<"llvm-cpu", "system-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-pc-linux-gnu",
- native_vector_size = 64 : index
- }>) {
- hal.executable.export public @generic_static layout(#pipeline_layout)
- builtin.module {
- func.func @generic_static() {
- %input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<96x16xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
- %input = flow.dispatch.tensor.load %input_binding, offsets = [0, 0], sizes = [96, 16], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<96x16xf32>> -> tensor<96x16xf32>
- %init = tensor.empty() : tensor<16x96xf32>
- %result = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%input : tensor<96x16xf32>) outs(%init : tensor<16x96xf32>) {
- ^bb0(%b0: f32, %b1: f32): // no predecessors
- linalg.yield %b0 : f32
- } -> tensor<16x96xf32>
- flow.dispatch.tensor.store %result, %result_binding, offsets = [0, 0], sizes = [16, 96], strides = [1, 1]
- : tensor<16x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
- return
- }
- }
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-pc-linux-gnu"}>
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @generic_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x16xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [96, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x16xf32>> -> tensor<96x16xf32>
+ %3 = tensor.empty() : tensor<16x96xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<96x16xf32>) outs(%3 : tensor<16x96xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<16x96xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [16, 96], strides = [1, 1] : tensor<16x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 96], [16, 16], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @generic_static
+// CHECK: func.func @generic_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_static {
- hal.executable.variant public @embedded_elf_x86_64 target(#hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-none-elf",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export public @matmul_static layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_static() {
- %cst = arith.constant 0.0 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [384, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
- %init = tensor.empty() : tensor<384x128xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
- outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
- : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @matmul_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+ %5 = tensor.empty() : tensor<384x128xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_static
+// CHECK: func.func @matmul_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-none-elf",
- native_vector_size = 16 : index
- }
->
-hal.executable private @reduction {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @predict_dispatch_86 ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @predict_dispatch_86(%arg0: !flow.dispatch.tensor<readonly:tensor<7x7x2048xf32>>,
- %arg1: !flow.dispatch.tensor<writeonly:tensor<7xf32>>) {
- %cst = arith.constant 0.0 : f32
- %cst1 = arith.constant 10.0 : f32
- %input = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [7, 7, 2048], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<7x7x2048xf32>> -> tensor<7x7x2048xf32>
- %init = tensor.empty() : tensor<7xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<7xf32>) -> tensor<7xf32>
- %reduce = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>],
- iterator_types = ["parallel", "reduction", "reduction"]}
- ins(%input : tensor<7x7x2048xf32>) outs(%fill : tensor<7xf32>) {
- ^bb0(%b0: f32, %b1: f32):
- %addf = arith.addf %b0, %b1 : f32
- linalg.yield %addf : f32
- } -> tensor<7xf32>
- %generic = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
- iterator_types = ["parallel"]}
- ins(%reduce : tensor<7xf32>) outs(%init : tensor<7xf32>) {
- ^bb0(%b0: f32, %b1: f32):
- %11 = arith.divf %b0, %cst1 : f32
- linalg.yield %11 : f32
- } -> tensor<7xf32>
- flow.dispatch.tensor.store %generic, %arg1, offsets = [0], sizes = [7], strides = [1]
- : tensor<7xf32> -> !flow.dispatch.tensor<writeonly:tensor<7xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0)>
+#map2 = affine_map<(d0) -> (d0)>
+module {
+ func.func @predict_dispatch_86(%arg0: !flow.dispatch.tensor<readonly:tensor<7x7x2048xf32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<7xf32>>) attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 1.000000e+01 : f32
+ %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0, 0], sizes = [7, 7, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<7x7x2048xf32>> -> tensor<7x7x2048xf32>
+ %1 = tensor.empty() : tensor<7xf32>
+ %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<7xf32>) -> tensor<7xf32>
+ %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction", "reduction"]} ins(%0 : tensor<7x7x2048xf32>) outs(%2 : tensor<7xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %5 = arith.addf %in, %out : f32
+ linalg.yield %5 : f32
+ } -> tensor<7xf32>
+ %4 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%3 : tensor<7xf32>) outs(%1 : tensor<7xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %5 = arith.divf %in, %cst_0 : f32
+ linalg.yield %5 : f32
+ } -> tensor<7xf32>
+ flow.dispatch.tensor.store %4, %arg1, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor<writeonly:tensor<7xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 0, 0], [1, 0, 0], [0, 1, 4], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @predict_dispatch_86
+// CHECK: func.func @predict_dispatch_86(
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic {indexing_maps = [#{{.+}}, #{{.+}}], iterator_types = ["parallel", "reduction", "reduction"]}
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_i8_i8_i32_static {
- hal.executable.variant public @embedded_elf_x86_64 target(#hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-none-elf",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export public @matmul_i8_i8_i32_static layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_i8_i8_i32_static() {
- %c0_i32 = arith.constant 0 : i32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xi8>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xi8>> -> tensor<128x384xi8>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>> -> tensor<384x1536xi8>
- %5 = tensor.empty() : tensor<128x1536xi32>
- %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
- %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @matmul_i8_i8_i32_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0_i32 = arith.constant 0 : i32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xi8>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xi8>> -> tensor<128x384xi8>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>> -> tensor<384x1536xi8>
+ %5 = tensor.empty() : tensor<128x1536xi32>
+ %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [64, 64, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_i8_i8_i32_static
+// CHECK: func.func @matmul_i8_i8_i32_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-none-elf",
- native_vector_size = 16 : index
- }
->
-#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
-#map1 = affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>
-hal.executable private @gemm_unit_N {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @gemm_unit_N ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @gemm_unit_N() {
- %c0 = arith.constant 0 : index
- %M = hal.interface.constant.load[0] : index
- %K = hal.interface.constant.load[1] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%K}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0)
- : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%M}
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, 1], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%K} -> tensor<?x1xf32>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%M, %K} -> tensor<?x?xf32>
- %init = flow.dispatch.tensor.load %result_binding, offsets = [0, 0], sizes = [%M, 1], strides = [1, 1]
- : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%M} -> tensor<?x1xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x1xf32>) outs(%init : tensor<?x1xf32>) -> tensor<?x1xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [%M, 1], strides = [1, 1]
- : tensor<?x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%M}
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @gemm_unit_N() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%1}
+ %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0}
+ %5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%1} -> tensor<?x1xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+ %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0} -> tensor<?x1xf32>
+ %8 = linalg.matmul ins(%6, %5 : tensor<?x?xf32>, tensor<?x1xf32>) outs(%7 : tensor<?x1xf32>) -> tensor<?x1xf32>
+ flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : tensor<?x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0}
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0, 0], [64, 0, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @gemm_unit_N
+// CHECK: func.func @gemm_unit_N()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-none-elf",
- native_vector_size = 16 : index
- }
->
-hal.executable private @gemm_unit_M_unit_N {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @gemm_unit_M_unit_N ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @gemm_unit_M_unit_N() {
- %c0 = arith.constant 0 : index
- %K = hal.interface.constant.load[0] : index
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%K}
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%K}
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [1, %K], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%K} -> tensor<1x?xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, 1], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%K} -> tensor<?x1xf32>
- %init = flow.dispatch.tensor.load %result_binding, offsets = [0, 0], sizes = [1, 1], strides = [1, 1]
- : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<1x?xf32>, tensor<?x1xf32>) outs(%init : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [1, 1], strides = [1, 1]
- : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @gemm_unit_M_unit_N() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0}
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%0}
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0} -> tensor<1x?xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%0} -> tensor<?x1xf32>
+ %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %7 = linalg.matmul ins(%4, %5 : tensor<1x?xf32>, tensor<?x1xf32>) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [0, 0, 0], [0, 0, 0], [1, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @gemm_unit_M_unit_N
+// CHECK: func.func @gemm_unit_M_unit_N()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-none-elf",
- native_vector_size = 16 : index
- }
->
-hal.executable private @matmul_odd {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @matmul_odd ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_odd() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<33x16xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x49xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<33x49xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<33x49xf32>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x16xf32>> -> tensor<33x16xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x49xf32>> -> tensor<16x49xf32>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x49xf32>> -> tensor<33x49xf32>
- %7 = tensor.empty() : tensor<33x49xf32>
- %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<33x49xf32>) -> tensor<33x49xf32>
- %9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32>
- flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor<writeonly:tensor<33x49xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @matmul_odd() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<33x16xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x49xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<33x49xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<33x49xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x16xf32>> -> tensor<33x16xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x49xf32>> -> tensor<16x49xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<33x49xf32>> -> tensor<33x49xf32>
+ %7 = tensor.empty() : tensor<33x49xf32>
+ %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<33x49xf32>) -> tensor<33x49xf32>
+ %9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32>
+ flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor<writeonly:tensor<33x49xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[11, 49, 0], [11, 49, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @matmul_odd
+// CHECK: func.func @matmul_odd()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @generic_unit_dims_dynamic {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @generic_unit_dims_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @generic_unit_dims_dynamic() {
- %c0 = arith.constant 0 : index
- %d0 = hal.interface.constant.load[0] : index
- %d1 = hal.interface.constant.load[1] : index
- %d2 = hal.interface.constant.load[2] : index
- %d3 = hal.interface.constant.load[3] : index
- %in_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
- : !flow.dispatch.tensor<readonly:tensor<1x?x1x1x?x?x1x?xf32>>{%d0, %d1, %d2, %d3}
- %result_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
- : !flow.dispatch.tensor<writeonly:tensor<1x?x1x1x?x?x1x?xf32>>{%d0, %d1, %d2, %d3}
- %in = flow.dispatch.tensor.load %in_binding, offsets=[0, 0, 0, 0, 0, 0, 0, 0],
- sizes=[1, %d0, 1, 1, %d1, %d2, 1, %d3], strides=[1, 1, 1, 1, 1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x?x1x1x?x?x1x?xf32>>{%d0, %d1, %d2, %d3} -> tensor<1x?x1x1x?x?x1x?xf32>
- %init = tensor.empty(%d0, %d1, %d2, %d3) : tensor<1x?x1x1x?x?x1x?xf32>
- %generic = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>,
- affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]}
- ins(%in : tensor<1x?x1x1x?x?x1x?xf32>) outs(%init : tensor<1x?x1x1x?x?x1x?xf32>) {
- ^bb0(%arg0: f32, %arg1: f32): // no predecessors
- %7 = arith.addf %arg0, %arg0 : f32
- linalg.yield %7 : f32
- } -> tensor<1x?x1x1x?x?x1x?xf32>
- flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0, 0, 0, 0, 0, 0, 0],
- sizes = [1, %d0, 1, 1, %d1, %d2, 1, %d3], strides = [1, 1, 1, 1, 1, 1, 1, 1]
- : tensor<1x?x1x1x?x?x1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x?x1x1x?x?x1x?xf32>>{%d0, %d1, %d2, %d3}
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
+module {
+ func.func @generic_unit_dims_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.constant.load[3] : index
+ %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x?x1x1x?x?x1x?xf32>>{%0, %1, %2, %3}
+ %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x?x1x1x?x?x1x?xf32>>{%0, %1, %2, %3}
+ %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %0, 1, 1, %1, %2, 1, %3], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?x1x1x?x?x1x?xf32>>{%0, %1, %2, %3} -> tensor<1x?x1x1x?x?x1x?xf32>
+ %7 = tensor.empty(%0, %1, %2, %3) : tensor<1x?x1x1x?x?x1x?xf32>
+ %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x?x1x1x?x?x1x?xf32>) outs(%7 : tensor<1x?x1x1x?x?x1x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %9 = arith.addf %in, %in : f32
+ linalg.yield %9 : f32
+ } -> tensor<1x?x1x1x?x?x1x?xf32>
+ flow.dispatch.tensor.store %8, %5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %0, 1, 1, %1, %2, 1, %3], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x1x1x?x?x1x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x?x1x1x?x?x1x?xf32>>{%0, %1, %2, %3}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 64, 64, 0, 64], [1, 1, 1, 1, 1, 1, 1, 4], [0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @generic_unit_dims_dynamic
+// CHECK: func.func @generic_unit_dims_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @reduce_to_scalar_static {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @reduce_to_scalar_static layout(#pipeline_layout)
- builtin.module {
- func.func @reduce_to_scalar_static() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
- %3 = tensor.empty() : tensor<f32>
- %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<f32>) -> tensor<f32>
- %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%2 : tensor<128xf32>) outs(%4 : tensor<f32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %6 = arith.addf %arg0, %arg1 : f32
- linalg.yield %6 : f32
- } -> tensor<f32>
- flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module {
+ func.func @reduce_to_scalar_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
+ %3 = tensor.empty() : tensor<f32>
+ %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<f32>) -> tensor<f32>
+ %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%2 : tensor<128xf32>) outs(%4 : tensor<f32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %out : f32
+ linalg.yield %6 : f32
+ } -> tensor<f32>
+ flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
+ return
+ }
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [4], [0]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+// CHECK: func.func @reduce_to_scalar_static()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.generic
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module {
+ func.func @reduce_to_scalar_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%0}
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<f32>>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [%0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%0} -> tensor<?xf32>
+ %4 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
+ %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%3 : tensor<?xf32>) outs(%4 : tensor<f32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %out : f32
+ linalg.yield %6 : f32
+ } -> tensor<f32>
+ flow.dispatch.tensor.store %5, %2, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [4], [0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @reduce_to_scalar_static
+// CHECK: func.func @reduce_to_scalar_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @reduce_to_scalar_dynamic {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @reduce_to_scalar_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @reduce_to_scalar_dynamic() {
- %c0 = arith.constant 0 : index
- %d0 = hal.interface.constant.load[0] : index
- %in_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%d0}
- %out_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<f32>>
- %in = flow.dispatch.tensor.load %in_binding, offsets=[0], sizes=[%d0], strides=[1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%d0} -> tensor<?xf32>
- %out = flow.dispatch.tensor.load %out_binding, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
- %reduce = linalg.generic {
- indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>],
- iterator_types = ["reduction"]}
- ins(%in : tensor<?xf32>) outs(%out : tensor<f32>) {
- ^bb0(%arg0: f32, %arg1: f32): // no predecessors
- %7 = arith.addf %arg0, %arg1 : f32
- linalg.yield %7 : f32
- } -> tensor<f32>
- flow.dispatch.tensor.store %reduce, %out_binding, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
- return
- }
- }
- }
-}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [4], [0]]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @reduce_to_scalar_dynamic
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-
-// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @scalar {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- target_triple = "x86_64-unknown-linux-gnu",
- native_vector_size = 16 : index
- }>) {
- hal.executable.export @scalar layout(#pipeline_layout)
- builtin.module {
- func.func @scalar() {
- %c0 = arith.constant 0 : index
- %in_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<f32>>
- %out_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<f32>>
- %in = flow.dispatch.tensor.load %in_binding, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
- %out = flow.dispatch.tensor.load %out_binding, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<writeonly:tensor<f32>> -> tensor<f32>
- %reduce = linalg.generic {
- indexing_maps = [affine_map<() -> ()>,
- affine_map<() -> ()>],
- iterator_types = []}
- ins(%in : tensor<f32>) outs(%out : tensor<f32>) {
- ^bb0(%arg0: f32, %arg1: f32): // no predecessors
- %7 = arith.addf %arg0, %arg1 : f32
- linalg.yield %7 : f32
- } -> tensor<f32>
- flow.dispatch.tensor.store %reduce, %out_binding, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<() -> ()>
+module {
+ func.func @scalar() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<f32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<writeonly:tensor<f32>> -> tensor<f32>
+ %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%2 : tensor<f32>) outs(%3 : tensor<f32>) {
+ ^bb0(%in: f32, %out: f32):
+ %5 = arith.addf %in, %out : f32
+ linalg.yield %5 : f32
+ } -> tensor<f32>
+ flow.dispatch.tensor.store %4, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
+ return
}
}
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
-// CHECK: hal.executable.export public @scalar
+// CHECK: func.func @scalar()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_8x8 {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_8x8 layout(#pipeline_layout)
- builtin.module {
- func.func @transpose_8x8() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%3 : tensor<512x1024xf32>) outs(%5 : tensor<1024x512xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<1024x512xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @transpose_8x8() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1024x512xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ return
}
}
@@ -1465,44 +896,23 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
- "llvm-cpu",
- "embedded-elf-x86_64", {
- cpu_features = "+avx2,+avx512f",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index,
- target_triple = "x86_64-none-elf"
- }>
-
-
-hal.executable private @transpose_16x16 {
- hal.executable.variant @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export @transpose_16x16 layout(#pipeline_layout)
- builtin.module {
- func.func @transpose_16x16() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
- %6 = linalg.generic {
- indexing_maps = [ affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%3 : tensor<512x1024xf32>) outs(%5 : tensor<1024x512xf32>) {
- ^bb0(%arg1: f32, %arg2: f32):
- linalg.yield %arg1 : f32
- } -> tensor<1024x512xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2,+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @transpose_16x16() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1024x512xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x512xf32>>
+ return
}
}
@@ -1511,67 +921,43 @@
// -----
-hal.executable private @multi_root {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @multi_root ordinal(0)
- layout(#hal.pipeline.layout<
- push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @multi_root() {
- %c0 = arith.constant 0 : index
- %c6144 = arith.constant 6144 : index
- %c792576 = arith.constant 792576 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<12x128x128xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<12x128xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c792576)
- : !flow.dispatch.tensor<writeonly:tensor<12x128xf32>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 128], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<12x128x128xf32>> -> tensor<12x128x128xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [12, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<12x128xf32>> -> tensor<12x128xf32>
- %7 = tensor.empty() : tensor<12x128xf32>
- %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<12x128xf32>) -> tensor<12x128xf32>
- %9 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%4 : tensor<12x128x128xf32>) outs(%5 : tensor<12x128xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %11 = arith.maximumf %arg0, %arg1 : f32
- linalg.yield %11 : f32
- } -> tensor<12x128xf32>
- %10 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%4, %9 : tensor<12x128x128xf32>, tensor<12x128xf32>)
- outs(%8 : tensor<12x128xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg3: f32):
- %11 = arith.subf %arg0, %arg1 : f32
- %12 = math.exp %11 : f32
- %13 = arith.addf %12, %arg3 : f32
- linalg.yield %13 : f32
- } -> tensor<12x128xf32>
- flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [12, 128], strides = [1, 1]
- : tensor<12x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @multi_root() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %c6144 = arith.constant 6144 : index
+ %c792576 = arith.constant 792576 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c792576) : !flow.dispatch.tensor<writeonly:tensor<12x128xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x128xf32>> -> tensor<12x128x128xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [12, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128xf32>> -> tensor<12x128xf32>
+ %5 = tensor.empty() : tensor<12x128xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<12x128xf32>) -> tensor<12x128xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3 : tensor<12x128x128xf32>) outs(%4 : tensor<12x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %9 = arith.maximumf %in, %out : f32
+ linalg.yield %9 : f32
+ } -> tensor<12x128xf32>
+ %8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %7 : tensor<12x128x128xf32>, tensor<12x128xf32>) outs(%6 : tensor<12x128xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %9 = arith.subf %in, %in_0 : f32
+ %10 = math.exp %9 : f32
+ %11 = arith.addf %10, %out : f32
+ linalg.yield %11 : f32
+ } -> tensor<12x128xf32>
+ flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [12, 128], strides = [1, 1] : tensor<12x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 32], [1, 4], [0, 0], [0, 0]]
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 32, 0], [1, 4, 0], [0, 0, 4], [0, 0, 0]]
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @multi_root
+// CHECK: func.func @multi_root()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG1]]
@@ -1582,127 +968,86 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @pack {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "+avx512f",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index,
- target_triple = "x86_64-none-elf"
- }>) {
- hal.executable.export public @pack layout(#pipeline_layout)
- builtin.module {
- func.func @pack() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<20x40xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<20x40xf32>> -> tensor<20x40xf32>
- %3 = tensor.empty() : tensor<2x48x16x1xf32>
- %4 = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf32> -> tensor<2x48x16x1xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<20x40xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<20x40xf32>> -> tensor<20x40xf32>
+ %3 = tensor.empty() : tensor<2x48x16x1xf32>
+ %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf32> -> tensor<2x48x16x1xf32>
+ flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 40], [1, 16]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
-// CHECK: hal.executable.export public @pack
+// CHECK: func.func @pack()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.pack
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @pack_many_elements {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "+avx512f",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index,
- target_triple = "x86_64-none-elf"
- }>) {
- hal.executable.export public @pack_many_elements layout(#pipeline_layout)
- builtin.module {
- func.func @pack_many_elements() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
- %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
- %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
- flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @pack_many_elements() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
+ %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
+ %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
+ flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[100, 31250], [1, 1]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
-// CHECK: hal.executable.export public @pack_many_elements
+// CHECK: func.func @pack_many_elements()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.pack
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- <0, bindings = [
- <0, storage_buffer, ReadOnly>,
- <1, storage_buffer, ReadOnly>,
- <2, storage_buffer>
- ]>
-]>
-hal.executable private @unpack_generic_pack {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu_features = "+avx512f",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index,
- target_triple = "x86_64-none-elf"
- }>) {
- hal.executable.export public @unpack_generic_pack layout(#pipeline_layout)
- builtin.module {
- func.func @unpack_generic_pack(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 3.40282347E+38 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [24, 32, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>> -> tensor<24x32x16x16xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
- %5 = tensor.empty() : tensor<24x512x16x1xf32>
- %6 = tensor.empty() : tensor<384x512xf32>
- %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %6 : tensor<24x32x16x16xf32> -> tensor<384x512xf32>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<512xf32>, tensor<384x512xf32>) outs(%6 : tensor<384x512xf32>) {
- ^bb0(%in: f32, %in_1: f32, %out: f32):
- %8 = arith.addf %in, %in_1 : f32
- %9 = arith.minimumf %8, %cst : f32
- %10 = arith.maximumf %9, %cst_0 : f32
- linalg.yield %10 : f32
- } -> tensor<384x512xf32>
- %pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x512xf32> -> tensor<24x512x16x1xf32>
- flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [24, 512, 16, 1], strides = [1, 1, 1, 1] : tensor<24x512x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @unpack_generic_pack(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [24, 32, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>> -> tensor<24x32x16x16xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
+ %5 = tensor.empty() : tensor<24x512x16x1xf32>
+ %6 = tensor.empty() : tensor<384x512xf32>
+ %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %6 : tensor<24x32x16x16xf32> -> tensor<384x512xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<512xf32>, tensor<384x512xf32>) outs(%6 : tensor<384x512xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %8 = arith.addf %in, %in_1 : f32
+ %9 = arith.minimumf %8, %cst : f32
+ %10 = arith.maximumf %9, %cst_0 : f32
+ linalg.yield %10 : f32
+ } -> tensor<384x512xf32>
+ %pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x512xf32> -> tensor<24x512x16x1xf32>
+ flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [24, 512, 16, 1], strides = [1, 1, 1, 1] : tensor<24x512x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 64], [1, 4], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [16, 4], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @unpack_generic_pack
+// CHECK: func.func @unpack_generic_pack(
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.unpack
// CHECK-SAME: lowering_config = #[[CONFIG2]]
@@ -1713,39 +1058,31 @@
// -----
-hal.executable private @elem_pack {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @elem_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %c1 = arith.constant 1 : index
- %0 = affine.apply affine_map<()[s0] -> ((s0 ceildiv 8) ceildiv 64)>()[%arg1]
- %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2]
- hal.return %1, %0, %c1 : index, index, index
- }
- builtin.module {
- func.func @elem_pack() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
- %3 = tensor.empty() : tensor<128x384xf32>
- %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<128x384xf32>) outs(%3 : tensor<128x384xf32>) {
- ^bb0(%in: f32, %out: f32):
- %7 = arith.addf %in, %in : f32
- linalg.yield %7 : f32
- } -> tensor<128x384xf32>
- %5 = tensor.empty() : tensor<16x384x8x1xf32>
- %6 = tensor.pack %4 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %5 : tensor<128x384xf32> -> tensor<16x384x8x1xf32>
- flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [16, 384, 8, 1], strides = [1, 1, 1, 1] : tensor<16x384x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @elem_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
+ %3 = tensor.empty() : tensor<128x384xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<128x384xf32>) outs(%3 : tensor<128x384xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %in : f32
+ linalg.yield %6 : f32
+ } -> tensor<128x384xf32>
+ %5 = tensor.empty() : tensor<16x384x8x1xf32>
+ %pack = tensor.pack %4 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %5 : tensor<128x384xf32> -> tensor<16x384x8x1xf32>
+ flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [16, 384, 8, 1], strides = [1, 1, 1, 1] : tensor<16x384x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [8, 1], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 64], [1, 1], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @elem_pack
+// CHECK: func.func @elem_pack()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG1]]
@@ -1754,41 +1091,33 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @transpose_pack {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>) {
- hal.executable.export @transpose_pack layout(#pipeline_layout)
- builtin.module {
- func.func @transpose_pack() {
- %c1579008 = arith.constant 1579008 : index
- %c3147776 = arith.constant 3147776 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1579008) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3147776) : !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [30522, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>> -> tensor<30522x768xf32>
- %3 = tensor.empty() : tensor<768x30522xf32>
- %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<30522x768xf32>) outs(%3 : tensor<768x30522xf32>) {
- ^bb0(%in: f32, %out: f32):
- linalg.yield %in : f32
- } -> tensor<768x30522xf32>
- %5 = tensor.empty() : tensor<1908x768x16x1xf32>
- %pack = tensor.pack %4 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %5 : tensor<768x30522xf32> -> tensor<1908x768x16x1xf32>
- flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [1908, 768, 16, 1], strides = [1, 1, 1, 1] : tensor<1908x768x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>
+#map = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @transpose_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c1579008 = arith.constant 1579008 : index
+ %c3147776 = arith.constant 3147776 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1579008) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3147776) : !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [30522, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>> -> tensor<30522x768xf32>
+ %3 = tensor.empty() : tensor<768x30522xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<30522x768xf32>) outs(%3 : tensor<768x30522xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<768x30522xf32>
+ %5 = tensor.empty() : tensor<1908x768x16x1xf32>
+ %pack = tensor.pack %4 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %5 : tensor<768x30522xf32> -> tensor<1908x768x16x1xf32>
+ flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [1908, 768, 16, 1], strides = [1, 1, 1, 1] : tensor<1908x768x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 16], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 64], [1, 1], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @transpose_pack
+// CHECK: func.func @transpose_pack()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG1]]
@@ -1797,67 +1126,60 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @reduction_broadcast_pack {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>) {
- hal.executable.export @reduction_broadcast_pack layout(#pipeline_layout)
- builtin.module {
- func.func @reduction_broadcast_pack() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant -0.000000e+00 : f32
- %cst_0 = arith.constant 1.024000e+03 : f32
- %cst_1 = arith.constant 9.99999996E-13 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
- %6 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
- %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
- %8 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
- %9 = tensor.empty() : tensor<24x1024x16x1xf32>
- %10 = tensor.empty() : tensor<384x1024xf32>
- %11 = tensor.empty() : tensor<384xf32>
- %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<384xf32>) -> tensor<384xf32>
- %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%5, %6 : tensor<384x1024xf32>, tensor<384xf32>) outs(%12 : tensor<384xf32>) {
- ^bb0(%in: f32, %in_2: f32, %out: f32):
- %15 = arith.subf %in, %in_2 : f32
- %16 = arith.mulf %15, %15 : f32
- %17 = arith.addf %out, %16 : f32
- linalg.yield %17 : f32
- } -> tensor<384xf32>
- %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5, %13, %7, %8, %6 : tensor<384x1024xf32>, tensor<384xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<384xf32>) outs(%10 : tensor<384x1024xf32>) {
- ^bb0(%in: f32, %in_2: f32, %in_3: f32, %in_4: f32, %in_5: f32, %out: f32):
- %15 = arith.divf %in_2, %cst_0 : f32
- %16 = arith.addf %15, %cst_1 : f32
- %17 = math.rsqrt %16 : f32
- %18 = arith.mulf %17, %in_3 : f32
- %19 = arith.mulf %in_5, %18 : f32
- %20 = arith.subf %in_4, %19 : f32
- %21 = arith.mulf %in, %18 : f32
- %22 = arith.addf %21, %20 : f32
- linalg.yield %22 : f32
- } -> tensor<384x1024xf32>
- %pack = tensor.pack %14 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %9 : tensor<384x1024xf32> -> tensor<24x1024x16x1xf32>
- flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [24, 1024, 16, 1], strides = [1, 1, 1, 1] : tensor<24x1024x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+#map2 = affine_map<(d0, d1) -> (d1)>
+module {
+ func.func @reduction_broadcast_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant -0.000000e+00 : f32
+ %cst_0 = arith.constant 1.024000e+03 : f32
+ %cst_1 = arith.constant 9.99999996E-13 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
+ %9 = tensor.empty() : tensor<24x1024x16x1xf32>
+ %10 = tensor.empty() : tensor<384x1024xf32>
+ %11 = tensor.empty() : tensor<384xf32>
+ %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<384xf32>) -> tensor<384xf32>
+ %13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%5, %6 : tensor<384x1024xf32>, tensor<384xf32>) outs(%12 : tensor<384xf32>) {
+ ^bb0(%in: f32, %in_2: f32, %out: f32):
+ %15 = arith.subf %in, %in_2 : f32
+ %16 = arith.mulf %15, %15 : f32
+ %17 = arith.addf %out, %16 : f32
+ linalg.yield %17 : f32
+ } -> tensor<384xf32>
+ %14 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %13, %7, %8, %6 : tensor<384x1024xf32>, tensor<384xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<384xf32>) outs(%10 : tensor<384x1024xf32>) {
+ ^bb0(%in: f32, %in_2: f32, %in_3: f32, %in_4: f32, %in_5: f32, %out: f32):
+ %15 = arith.divf %in_2, %cst_0 : f32
+ %16 = arith.addf %15, %cst_1 : f32
+ %17 = math.rsqrt %16 : f32
+ %18 = arith.mulf %17, %in_3 : f32
+ %19 = arith.mulf %in_5, %18 : f32
+ %20 = arith.subf %in_4, %19 : f32
+ %21 = arith.mulf %in, %18 : f32
+ %22 = arith.addf %21, %20 : f32
+ linalg.yield %22 : f32
+ } -> tensor<384x1024xf32>
+ %pack = tensor.pack %14 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %9 : tensor<384x1024xf32> -> tensor<24x1024x16x1xf32>
+ flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [24, 1024, 16, 1], strides = [1, 1, 1, 1] : tensor<24x1024x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32], [16], [0], [0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 0], [16, 0], [0, 16], [0, 0]]>
// CHECK-DAG: #[[CONFIG3:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 0], [16, 0], [0, 0], [0, 1]]>
// CHECK-DAG: #[[CONFIG4:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[2, 0], [1, 0], [0, 0], [0, 1]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @reduction_broadcast_pack
+// CHECK: func.func @reduction_broadcast_pack()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG1]]
@@ -1870,48 +1192,40 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @reduction_pack {
- hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>) {
- hal.executable.export @reduction_pack layout(#pipeline_layout)
- builtin.module {
- func.func @reduction_pack() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant -0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [384, 1024, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>> -> tensor<384x1024x32xf32>
- %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
- %9 = tensor.empty() : tensor<1024x24x16x1xf32>
- %10 = tensor.empty() : tensor<384x1024x32xf32>
- %11 = tensor.empty() : tensor<384x1024xf32>
- %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<384x1024xf32>) -> tensor<384x1024xf32>
- %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %6 : tensor<384x1024x32xf32>, tensor<384x1024xf32>) outs(%12 : tensor<384x1024xf32>) {
- ^bb0(%in: f32, %in_2: f32, %out: f32):
- %15 = arith.subf %in, %in_2 : f32
- %16 = arith.mulf %15, %15 : f32
- %17 = arith.addf %out, %16 : f32
- linalg.yield %17 : f32
- } -> tensor<384x1024xf32>
- %pack = tensor.pack %13 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %9 : tensor<384x1024xf32> -> tensor<1024x24x16x1xf32>
- flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [1024, 24, 16, 1], strides = [1, 1, 1, 1] : tensor<1024x24x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @reduction_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant -0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [384, 1024, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>> -> tensor<384x1024x32xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
+ %5 = tensor.empty() : tensor<1024x24x16x1xf32>
+ %6 = tensor.empty() : tensor<384x1024x32xf32>
+ %7 = tensor.empty() : tensor<384x1024xf32>
+ %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<384x1024xf32>) -> tensor<384x1024xf32>
+ %9 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<384x1024x32xf32>, tensor<384x1024xf32>) outs(%8 : tensor<384x1024xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %10 = arith.subf %in, %in_0 : f32
+ %11 = arith.mulf %10, %10 : f32
+ %12 = arith.addf %out, %11 : f32
+ linalg.yield %12 : f32
+ } -> tensor<384x1024xf32>
+ %pack = tensor.pack %9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x1024xf32> -> tensor<1024x24x16x1xf32>
+ flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [1024, 24, 16, 1], strides = [1, 1, 1, 1] : tensor<1024x24x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32], [16, 1], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [16, 1, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[CONFIG3:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 2], [1, 1], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @reduction_pack
+// CHECK: func.func @reduction_pack()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG1]]
@@ -1922,155 +1236,126 @@
// -----
-hal.executable private @unpack_static {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @unpack_static ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @unpack_static() {
- %c41943040 = arith.constant 41943040 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c41943040) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 256, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>> -> tensor<64x256x16x16xf32>
- %3 = tensor.empty() : tensor<1024x4096xf32>
- %unpack = tensor.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor<64x256x16x16xf32> -> tensor<1024x4096xf32>
- flow.dispatch.tensor.store %unpack, %1, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : tensor<1024x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @unpack_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c41943040 = arith.constant 41943040 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c41943040) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 256, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>> -> tensor<64x256x16x16xf32>
+ %3 = tensor.empty() : tensor<1024x4096xf32>
+ %unpack = tensor.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor<64x256x16x16xf32> -> tensor<1024x4096xf32>
+ flow.dispatch.tensor.store %unpack, %1, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : tensor<1024x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 4096], [16, 16]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
-// CHECK: hal.executable.export public @unpack_static
+// CHECK: func.func @unpack_static()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.unpack
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-hal.executable private @unpack_elem {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @unpack_elem ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @unpack_elem() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [48, 64, 8, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>> -> tensor<48x64x8x2xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
- %5 = tensor.empty() : tensor<128x384xf32>
- %6 = tensor.empty() : tensor<384x128xf32>
- %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %6 : tensor<48x64x8x2xf32> -> tensor<384x128xf32>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<128xf32>, tensor<384x128xf32>) outs(%5 : tensor<128x384xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %8 = arith.addf %in, %in_0 : f32
- linalg.yield %8 : f32
- } -> tensor<128x384xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d0)>
+#map1 = affine_map<(d0, d1) -> (d1, d0)>
+#map2 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @unpack_elem() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [48, 64, 8, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>> -> tensor<48x64x8x2xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
+ %5 = tensor.empty() : tensor<128x384xf32>
+ %6 = tensor.empty() : tensor<384x128xf32>
+ %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %6 : tensor<48x64x8x2xf32> -> tensor<384x128xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<128xf32>, tensor<384x128xf32>) outs(%5 : tensor<128x384xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.addf %in, %in_0 : f32
+ linalg.yield %8 : f32
+ } -> tensor<128x384xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [4, 8], [0, 0], [0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @unpack_elem
+// CHECK: func.func @unpack_elem()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-hal.executable private @quant_model {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @quant_model ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer, ReadOnly>, <4, storage_buffer, ReadOnly>, <5, storage_buffer, ReadOnly>, <6, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @quant_model() {
- %c0 = arith.constant 0 : index
- %c12_i32 = arith.constant 12 : i32
- %c-128_i32 = arith.constant -128 : i32
- %c127_i32 = arith.constant 127 : i32
- %c0_i32 = arith.constant 0 : i32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<24x144xi8>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
- %6 = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
- %7 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>> -> tensor<2304x24xi8>
- %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<24x144xi8>> -> tensor<24x144xi8>
- %9 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
- %13 = tensor.empty() : tensor<2304x144xi8>
- %14 = tensor.empty() : tensor<2304x144xi32>
- %15 = linalg.fill ins(%c0_i32 : i32) outs(%14 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
- %16 = linalg.matmul ins(%7, %8 : tensor<2304x24xi8>, tensor<24x144xi8>) outs(%15 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
- %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %16 : tensor<144xi32>, tensor<2304x144xi32>) outs(%13 : tensor<2304x144xi8>) {
- ^bb0(%in: i32, %in_0: i32, %out: i8):
- %19 = arith.subi %in_0, %c12_i32 : i32
- %20 = arith.addi %in, %19 : i32
- %27 = arith.trunci %20 : i32 to i8
- linalg.yield %27 : i8
- } -> tensor<2304x144xi8>
- flow.dispatch.tensor.store %17, %6, offsets = [0, 0], sizes = [2304, 144], strides = [1, 1] : tensor<2304x144xi8> -> !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @quant_model() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %c12_i32 = arith.constant 12 : i32
+ %c-128_i32 = arith.constant -128 : i32
+ %c127_i32 = arith.constant 127 : i32
+ %c0_i32 = arith.constant 0 : i32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<24x144xi8>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
+ %3 = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>> -> tensor<2304x24xi8>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<24x144xi8>> -> tensor<24x144xi8>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
+ %7 = tensor.empty() : tensor<2304x144xi8>
+ %8 = tensor.empty() : tensor<2304x144xi32>
+ %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
+ %10 = linalg.matmul ins(%4, %5 : tensor<2304x24xi8>, tensor<24x144xi8>) outs(%9 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
+ %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %10 : tensor<144xi32>, tensor<2304x144xi32>) outs(%7 : tensor<2304x144xi8>) {
+ ^bb0(%in: i32, %in_0: i32, %out: i8):
+ %12 = arith.subi %in_0, %c12_i32 : i32
+ %13 = arith.addi %in, %12 : i32
+ %14 = arith.trunci %13 : i32 to i8
+ linalg.yield %14 : i8
+ } -> tensor<2304x144xi8>
+ flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [2304, 144], strides = [1, 1] : tensor<2304x144xi8> -> !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 48, 0], [64, 48, 0], [0, 0, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @quant_model
+// CHECK: func.func @quant_model()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-hal.executable private @no_compute_ops {
- hal.executable.variant public @embedded_elf_x86_64 target(<
- "llvm-cpu", "embedded-elf-x86_64",
- {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = false}>) {
- hal.executable.export public @test ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device):
- %c1 = arith.constant 1 : index
- hal.return %c1, %c1, %c1 : index, index, index
- }
- builtin.module {
- func.func @test() {
- %c0 = arith.constant 0 : index
- %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64
- %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
- %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
- %extracted = tensor.extract %2[] : tensor<i64>
- %3 = arith.muli %extracted, %c6364136223846793005_i64 : i64
- %4 = arith.addi %3, %c1442695040888963407_i64 : i64
- %inserted = tensor.insert %4 into %2[] : tensor<i64>
- flow.dispatch.tensor.store %inserted, %1, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = false}>
+module {
+ func.func @test() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64
+ %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
+ %extracted = tensor.extract %2[] : tensor<i64>
+ %3 = arith.muli %extracted, %c6364136223846793005_i64 : i64
+ %4 = arith.addi %3, %c1442695040888963407_i64 : i64
+ %inserted = tensor.insert %4 into %2[] : tensor<i64>
+ flow.dispatch.tensor.store %inserted, %1, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+ return
}
}
+
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDefault>
-// CHECK: hal.executable private @no_compute_ops
-// CHECK: hal.executable.export public @test
+// CHECK: func.func @test()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
@@ -2078,109 +1363,79 @@
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : index, target_triple = "x86_64-unknown-linux-gnu", ukernels = false}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
-
-hal.executable private @non_trivial_program {
- hal.executable.variant public @system_elf_x86_64 target(#executable_target_system_elf_x86_64_) {
- hal.executable.export public @non_trivial_program ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device):
- %c1 = arith.constant 1 : index
- hal.return %c1, %c1, %c1 : index, index, index
- }
- builtin.module {
- func.func @non_trivial_program() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1x128x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 1, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1x128x1xf32>> -> tensor<128x1x128x1xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1xf32>> -> tensor<128x1xf32>
- %5 = tensor.empty() : tensor<1x1xf32>
- %6 = tensor.empty() : tensor<128xf32>
- %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<128xf32>) -> tensor<128xf32>
- %8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
- %collapsed = tensor.collapse_shape %3 [[0, 1], [2, 3]] : tensor<128x1x128x1xf32> into tensor<128x128xf32>
- %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%collapsed : tensor<128x128xf32>) outs(%7 : tensor<128xf32>) {
- ^bb0(%in: f32, %out: f32):
- %11 = arith.addf %out, %in : f32
- linalg.yield %11 : f32
- } -> tensor<128xf32>
- %expanded = tensor.expand_shape %9 [[0, 1]] : tensor<128xf32> into tensor<1x128xf32>
- %10 = linalg.matmul ins(%expanded, %4 : tensor<1x128xf32>, tensor<128x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
- flow.dispatch.tensor.store %10, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1xf32>>
- return
- }
- }
+module {
+ func.func @non_trivial_program() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1x128x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 1, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1x128x1xf32>> -> tensor<128x1x128x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1xf32>> -> tensor<128x1xf32>
+ %5 = tensor.empty() : tensor<1x1xf32>
+ %6 = tensor.empty() : tensor<128xf32>
+ %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<128xf32>) -> tensor<128xf32>
+ %8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ %collapsed = tensor.collapse_shape %3 [[0, 1], [2, 3]] : tensor<128x1x128x1xf32> into tensor<128x128xf32>
+ %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%collapsed : tensor<128x128xf32>) outs(%7 : tensor<128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %11 = arith.addf %out, %in : f32
+ linalg.yield %11 : f32
+ } -> tensor<128xf32>
+ %expanded = tensor.expand_shape %9 [[0, 1]] : tensor<128xf32> into tensor<1x128xf32>
+ %10 = linalg.matmul ins(%expanded, %4 : tensor<1x128xf32>, tensor<128x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %10, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [0, 0, 0], [0, 0, 0], [1, 32, 0], [0, 0, 16], [0, 0, 0]]>
// CHECK-NOT: lowering_config
-// CHECK: hal.executable.export public @non_trivial_program
+// CHECK: func.func @non_trivial_program()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"
-}>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2) -> (d0)>
module {
- hal.executable public @i4_dequant_matvec {
- hal.executable.variant public @llvm target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @i4_dequant_matvec_f32 layout(#pipeline_layout)
- builtin.module {
- func.func @i4_dequant_matvec_f32() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
- %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
- %9 = tensor.empty() : tensor<4096xf32>
- %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
- %11 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %5, %6, %7 : tensor<86x128xf32>, tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096xf32>) {
- ^bb0(%in: f32, %in_0: i4, %in_1: f32, %in_2: f32, %out: f32):
- %12 = arith.extui %in_0 : i4 to i32
- %13 = arith.uitofp %12 : i32 to f32
- %14 = arith.subf %13, %in_2 : f32
- %15 = arith.mulf %14, %in_1 : f32
- %16 = arith.mulf %in, %15 : f32
- %17 = arith.addf %16, %out : f32
- linalg.yield %17 : f32
- } -> tensor<4096xf32>
- flow.dispatch.tensor.store %11, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
- return
- }
- }
- }
+ func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
+ %9 = tensor.empty() : tensor<4096xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %5, %6, %7 : tensor<86x128xf32>, tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096xf32>) {
+ ^bb0(%in: f32, %in_0: i4, %in_1: f32, %in_2: f32, %out: f32):
+ %12 = arith.extui %in_0 : i4 to i32
+ %13 = arith.uitofp %12 : i32 to f32
+ %14 = arith.subf %13, %in_2 : f32
+ %15 = arith.mulf %14, %in_1 : f32
+ %16 = arith.mulf %in, %15 : f32
+ %17 = arith.addf %16, %out : f32
+ linalg.yield %17 : f32
+ } -> tensor<4096xf32>
+ flow.dispatch.tensor.store %11, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32], [4], [0], [0]]>
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 0, 0], [4, 0, 0], [0, 4, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @i4_dequant_matvec_f32
+// CHECK: func.func @i4_dequant_matvec_f32()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -2190,46 +1445,35 @@
// -----
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 28, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @batch_mmt4d {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @batch_mmt4d ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @batch_mmt4d() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = arith.extui %0 : i32 to i64
- %5 = arith.extui %1 : i32 to i64
- %6 = arith.shli %5, %c32_i64 : i64
- %7 = arith.ori %4, %6 : i64
- %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
- %9 = arith.extui %2 : i32 to i64
- %10 = arith.extui %3 : i32 to i64
- %11 = arith.shli %10, %c32_i64 : i64
- %12 = arith.ori %9, %11 : i64
- %13 = arith.index_castui %12 : i64 to index
- %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
- %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
- %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
- %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
- %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
- %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
- %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
- %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
- flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
- return
- }
- }
+module {
+ func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.extui %0 : i32 to i64
+ %5 = arith.extui %1 : i32 to i64
+ %6 = arith.shli %5, %c32_i64 : i64
+ %7 = arith.ori %4, %6 : i64
+ %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index
+ %9 = arith.extui %2 : i32 to i64
+ %10 = arith.extui %3 : i32 to i64
+ %11 = arith.shli %10, %c32_i64 : i64
+ %12 = arith.ori %9, %11 : i64
+ %13 = arith.index_castui %12 : i64 to index
+ %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>>
+ %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>>
+ %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+ %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x10x32x8x1xf32>> -> tensor<128x10x32x8x1xf32>
+ %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32x4x1xf32>> -> tensor<128x80x32x4x1xf32>
+ %19 = tensor.empty() : tensor<128x10x80x8x4xf32>
+ %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+ %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32>
+ flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x10x80x8x4xf32>>
+ return
}
}
@@ -2241,31 +1485,20 @@
// -----
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 28, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @mmt4d_with_large_reduction {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @mmt4d_with_large_reduction ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @mmt4d_with_large_reduction() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<7x18176x16x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<284x18176x16x1xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<7x284x16x16xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [7, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<7x18176x16x1xf32>> -> tensor<7x18176x16x1xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [284, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<284x18176x16x1xf32>> -> tensor<284x18176x16x1xf32>
- %5 = tensor.empty() : tensor<7x284x16x16xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32>
- %7 = linalg.mmt4d ins(%3, %4 : tensor<7x18176x16x1xf32>, tensor<284x18176x16x1xf32>) outs(%6 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [7, 284, 16, 16], strides = [1, 1, 1, 1] : tensor<7x284x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x284x16x16xf32>>
- return
- }
- }
+module {
+ func.func @mmt4d_with_large_reduction() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<7x18176x16x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<284x18176x16x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<7x284x16x16xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [7, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<7x18176x16x1xf32>> -> tensor<7x18176x16x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [284, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<284x18176x16x1xf32>> -> tensor<284x18176x16x1xf32>
+ %5 = tensor.empty() : tensor<7x284x16x16xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32>
+ %7 = linalg.mmt4d ins(%3, %4 : tensor<7x18176x16x1xf32>, tensor<284x18176x16x1xf32>) outs(%6 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [7, 284, 16, 16], strides = [1, 1, 1, 1] : tensor<7x284x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<7x284x16x16xf32>>
+ return
}
}
@@ -2276,87 +1509,60 @@
// -----
-hal.executable private @pad_only {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
- cpu = "generic", cpu_features = "",
- data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @pad_only ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @pad_only() {
- %c634816 = arith.constant 634816 : index
- %c3846080 = arith.constant 3846080 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c634816) flags(ReadOnly)
- : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3846080)
- : !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>> -> tensor<1x112x112x64xf32>
- %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst : f32
- } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32>
- flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1]
- : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
- return
- }
- }
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @pad_only() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c634816 = arith.constant 634816 : index
+ %c3846080 = arith.constant 3846080 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c634816) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3846080) : !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x112x112x64xf32>> -> tensor<1x112x112x64xf32>
+ %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst : f32
+ } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32>
+ flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1] : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x114x114x64xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 19, 19, 64], [1, 1, 1, 4], [0, 0, 0, 0], [0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
-// CHECK: hal.executable.export public @pad_only
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: func.func @pad_only()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.pad {{.+}} {
// CHECK: tensor.yield
// CHECK-NEXT: } {lowering_config = #[[CONFIG]]}
// -----
-hal.executable private @attention {
- hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
cpu = "generic", cpu_features = "",
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
- native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>) {
- hal.executable.export public @attention ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>, #hal.interface.binding<0, 3>]} {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @attention() {
- %c0 = arith.constant 0 : index
- %scale = arith.constant 0.125 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
- %7 = tensor.empty() : tensor<20x4096x64xf16>
- %8 = iree_linalg_ext.attention
- ins(%4, %5, %6, %scale : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16)
- outs(%7 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16>
- flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
- return
- }
+ native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+ func.func @attention() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %scale = arith.constant 0.125 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
+ %7 = tensor.empty() : tensor<20x4096x64xf16>
+ %8 = iree_linalg_ext.attention
+ ins(%4, %5, %6, %scale : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16)
+ outs(%7 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16>
+ flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
+ return
}
}
-}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[20, 64], [1, 1]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPULinalgExtTileAndVectorize>
-// CHECK: hal.executable.export public @attention
+// CHECK: func.func @attention()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: func.func @attention()
// CHECK: iree_linalg_ext.attention
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
index 2adc175..6264287 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir
@@ -1,45 +1,35 @@
-// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule | FileCheck %s
-
+// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | FileCheck %s
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-#device_target_cpu = #hal.device.target<"llvm-cpu", [#executable_target_embedded_elf_x86_64_]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
+func.func @pad_matmul_static_dispatch_0() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
+ %5 = tensor.empty() : tensor<250x1020xf32>
+ %cst = arith.constant 0.000000e+00 : f32
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
-hal.executable private @pad_matmul_static_dispatch_0 {
- hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
- hal.executable.export public @pad_matmul_static_dispatch_0 ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @pad_matmul_static_dispatch_0() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
+ // CHECK: memref.assume_alignment %{{.*}}, 64 : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>
+ // CHECK-NEXT: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
+ // CHECK-NEXT: linalg.matmul
+ // CHECK-SAME: ins(%{{.*}} : memref<250x500xf32, #hal.descriptor_type<storage_buffer>>, memref<500x1020xf32, #hal.descriptor_type<storage_buffer>>)
+ // CHECK-SAME: outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
+ // CHECK-NEXT: return
- %50 = tensor.empty() : tensor<250x1020xf32>
- %cst = arith.constant 0.000000e+00 : f32
- %5 = linalg.fill ins(%cst : f32) outs(%50 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
-
- // CHECK: memref.assume_alignment %{{.*}}, 64 : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>
- // CHECK-NEXT: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
- // CHECK-NEXT: linalg.matmul
- // CHECK-SAME: ins(%{{.*}} : memref<250x500xf32, #hal.descriptor_type<storage_buffer>>, memref<500x1020xf32, #hal.descriptor_type<storage_buffer>>)
- // CHECK-SAME: outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
- // CHECK-NEXT: return
-
- %6 = linalg.matmul ins(%3, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [250, 1020], strides = [1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
- return
- }
- }
- }
+ %7 = linalg.matmul ins(%3, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%6 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [250, 1020], strides = [1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
+ return
}
-module attributes { transform.with_named_sequence } {
- transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
- %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.yield
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+ %8 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.print %8 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %8 : (!transform.any_op) -> ()
+ %9 = transform.iree.bufferize %8 : (!transform.any_op) -> !transform.any_op
+ // %9 = transform.structured.match ops{["func.func"]} in %8 : (!transform.any_op) -> !transform.any_op
+ transform.yield
}
-} // module
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir
index cb41730..2906861 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-codegen-llvmcpu-vector-lowering-pipeline --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-llvmcpu-vector-lowering-pipeline))" --split-input-file %s | FileCheck %s
func.func @matmul_391x384x384_f32() {
%cst = arith.constant 0.000000e+00 : f32
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir
index ee0165f..c6ee662 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-llvmcpu-verify-linalg-transform-legality %s --verify-diagnostics -split-input-file
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-verify-linalg-transform-legality))" %s --verify-diagnostics -split-input-file
func.func @matmul_123x456xf32_times_456x789xf32_into_123x789xf32_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index 1e6e4eb..af43402 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -142,6 +142,7 @@
"//compiler/src/iree/compiler/Dialect/LinalgExt/Transforms",
"//compiler/src/iree/compiler/Dialect/Util/IR",
"//compiler/src/iree/compiler/Dialect/Util/Transforms",
+ "//compiler/src/iree/compiler/Utils",
"//llvm-external-projects/iree-dialects:IREELinalgTransformDialect",
"//llvm-external-projects/iree-dialects:IREELinalgTransformDialectPasses",
"//llvm-external-projects/iree-dialects:IREEVectorExtDialect",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index bd99273..54b5ffd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -184,6 +184,7 @@
iree::compiler::Dialect::LinalgExt::Transforms
iree::compiler::Dialect::Util::IR
iree::compiler::Dialect::Util::Transforms
+ iree::compiler::Utils
PUBLIC
)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
index e99f739..e2fa087 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
@@ -42,14 +42,12 @@
/// Return the CUDA capability of the gpu. Assumes CUDA capability is 80 (sm_80)
/// if not specified.
-static int getCUDACapbility(Operation *op) {
- FailureOr<IREE::HAL::ExecutableVariantOp> variantOp =
- getExecutableVariantOp(op);
- if (failed(variantOp)) {
- return kDefaultCUDACapability;
+static std::optional<int> getCUDACapbility(Operation *op) {
+ auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
+ if (!targetAttr) {
+ return std::nullopt;
}
- auto targetAttr = variantOp->getTargetAttr();
if (auto config = targetAttr.getConfiguration()) {
if (auto attr = config.getAs<StringAttr>("target_arch")) {
StringRef targetName = attr.getValue();
@@ -145,7 +143,8 @@
// is faulty for them.
// TODO: Remove this once the lowering in LLVM is fixed
// (https://github.com/llvm/llvm-project/issues/64606).
- if (getCUDACapbility(m) < 80) {
+ std::optional<int> cudaCapability = getCUDACapbility(m);
+ if (!cudaCapability || cudaCapability.value() < 80) {
RewritePatternSet patterns(&getContext());
populateReplaceSlowMinMaxOpsPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns)))) {
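The NVVM conversion above, the ROCDL conversion below, and several KernelConfig helpers all switch from walking up to the `hal.executable.variant` op to querying `IREE::HAL::ExecutableTargetAttr::lookup`. A minimal sketch of the shared idiom, using only the calls visible in this diff (the helper name `queryTargetArch` is illustrative, not part of the codebase):

```
// Sketch only: resolve "target_arch" by searching the op and its ancestors
// for the nearest executable target attribute.
static std::optional<StringRef> queryTargetArch(Operation *op) {
  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
  if (!targetAttr)
    return std::nullopt;
  if (auto config = targetAttr.getConfiguration())
    if (auto attr = config.getAs<StringAttr>("target_arch"))
      return attr.getValue();
  return std::nullopt;
}
```

Because the lookup is ancestor-based, it works the same whether the query starts from a function, a module, or a nested op, which is what lets these passes nest on `FunctionOpInterface` without reaching for the surrounding variant op.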
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index 41cdbfe..17fccde 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -49,14 +49,10 @@
namespace {
-// HACK: this is not the proper way to do this; each function may have a
-// locally-scoped arch in cases of multi-versioning and randomly picking any
-// function is going to produce bad results.
-
+/// Return the target arch attached to the most immediate parent.
static StringRef getTargetArch(mlir::FunctionOpInterface entryPoint) {
- if (auto variantOp =
- entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>()) {
- IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.getTarget();
+ auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint);
+ if (targetAttr) {
if (auto config = targetAttr.getConfiguration()) {
if (auto attr = config.getAs<StringAttr>("target_arch")) {
return attr.getValue();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index fa43297..c9b7a1e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -157,10 +157,9 @@
}
}
+// Get the target arch associated with the immediate parent.
static StringRef getTargetArch(mlir::FunctionOpInterface entryPoint) {
- if (auto variantOp =
- entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>()) {
- IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.getTarget();
+ if (auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) {
if (auto config = targetAttr.getConfiguration()) {
if (auto attr = config.getAs<StringAttr>("target_arch")) {
return attr.getValue();
@@ -171,9 +170,7 @@
}
bool isCudaTarget(mlir::FunctionOpInterface entryPoint) {
- if (auto variantOp =
- entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>()) {
- IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.getTarget();
+ if (auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) {
if (auto backend = targetAttr.getBackend()) {
return backend.getValue().str() == kCudaTarget;
}
@@ -182,9 +179,7 @@
}
bool isRocmTarget(mlir::FunctionOpInterface entryPoint) {
- if (auto variantOp =
- entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>()) {
- IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.getTarget();
+ if (auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) {
if (auto backend = targetAttr.getBackend()) {
return backend.getValue().str() == kRocmTarget;
}
@@ -1492,9 +1487,7 @@
setArgmaxUkernelConfig(mlir::FunctionOpInterface entryPoint,
linalg::GenericOp op, const TargetInfo &targetInfo) {
// Checks if UKernels are enabled.
- if (auto variantOp =
- entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>()) {
- auto target = variantOp.getTarget();
+ if (auto target = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) {
const char ukernelName[] = "argmax";
if (!hasUkernel(target, ukernelName) ||
!hasUkernelSupportedGpuArch(target)) {
@@ -1816,95 +1809,85 @@
//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
+LogicalResult initGPULaunchConfig(FunctionOpInterface funcOp) {
-LogicalResult initGPULaunchConfig(ModuleOp moduleOp) {
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
-
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = exportOps.lookup(funcOp.getName());
- if (!exportOp)
- continue;
-
- if (!getTranslationInfo(funcOp)) {
- // If no translation info set, first check whether we already have
- // workgroup count set--it's a "contract" to indicate that we should
- // bypass all tiling and distribution to go down just the most basic
- // lowering flow.
- if (Block *body = exportOp.getWorkgroupCountBody()) {
- auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
- // For scalar dispatch cases--using just one thread of one workgroup.
- auto isOne = [](Value value) { return matchPattern(value, m_One()); };
- if (llvm::all_of(retOp.getOperands(), isOne)) {
- std::array<int64_t, 3> workgroupSize = {1, 1, 1};
- if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
- return failure();
- auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
- funcOp.getContext(),
- IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering);
- if (failed(setTranslationInfo(funcOp, translationInfo))) {
- return failure();
- }
- continue;
+ auto exportOp = getEntryPoint(funcOp);
+ if (!getTranslationInfo(funcOp) && exportOp) {
+ // If no translation info set, first check whether we already have
+ // workgroup count set--it's a "contract" to indicate that we should
+ // bypass all tiling and distribution to go down just the most basic
+ // lowering flow.
+ if (Block *body = exportOp->getWorkgroupCountBody()) {
+ auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
+ // For scalar dispatch cases--using just one thread of one workgroup.
+ auto isOne = [](Value value) { return matchPattern(value, m_One()); };
+ if (llvm::all_of(retOp.getOperands(), isOne)) {
+ SmallVector<int64_t, 3> workgroupSize = {1, 1, 1};
+ auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+ funcOp.getContext(),
+ IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering,
+ workgroupSize);
+ if (failed(setTranslationInfo(funcOp, translationInfo))) {
+ return failure();
}
+ return success();
}
}
+ }
- SmallVector<Operation *> computeOps = getComputeOps(funcOp);
- if (getTranslationInfo(exportOp)) {
- // Currently LLVMGPU requires propagation of user lowering configs.
- for (auto op : computeOps) {
- if (getLoweringConfig(op)) {
- propagateLoweringConfig(op, computeOps);
- break;
- }
+ SmallVector<Operation *> computeOps = getComputeOps(funcOp);
+ if (getTranslationInfo(funcOp)) {
+ // Currently LLVMGPU requires propagation of user lowering configs.
+ for (auto op : computeOps) {
+ if (getLoweringConfig(op)) {
+ propagateLoweringConfig(op, computeOps);
+ break;
}
- continue;
}
+ return success();
+ }
- Operation *rootOperation = nullptr;
+ Operation *rootOperation = nullptr;
- // Find the root operation. linalg.generic and linalg.fill are not root
- // operations if there are other compute operations present.
- for (Operation *op : llvm::reverse(computeOps)) {
- if (!isa<linalg::GenericOp, linalg::FillOp>(op)) {
+ // Find the root operation. linalg.generic and linalg.fill are not root
+ // operations if there are other compute operations present.
+ for (Operation *op : llvm::reverse(computeOps)) {
+ if (!isa<linalg::GenericOp, linalg::FillOp>(op)) {
+ rootOperation = op;
+ break;
+ }
+ if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
+ // linalg.generic with `reduction` iterator types are roots as well.
+ if (genericOp.getNumLoops() != genericOp.getNumParallelLoops()) {
rootOperation = op;
break;
}
- if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
- // linalg.generic with `reduction` iterator types are roots as well.
- if (genericOp.getNumLoops() != genericOp.getNumParallelLoops()) {
- rootOperation = op;
- break;
- }
- }
}
-
- if (!rootOperation) {
- for (Operation *op : llvm::reverse(computeOps)) {
- if (isa<linalg::GenericOp, linalg::FillOp>(op)) {
- rootOperation = op;
- break;
- }
- }
- }
-
- if (!rootOperation) {
- // No root operation found, set it to none.
- auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
- funcOp.getContext(),
- IREE::Codegen::DispatchLoweringPassPipeline::None);
- if (failed(setTranslationInfo(funcOp, translationInfo))) {
- return failure();
- }
- continue;
- }
-
- if (failed(setRootConfig(funcOp, rootOperation)))
- continue;
-
- propagateLoweringConfig(rootOperation, computeOps);
}
+
+ if (!rootOperation) {
+ for (Operation *op : llvm::reverse(computeOps)) {
+ if (isa<linalg::GenericOp, linalg::FillOp>(op)) {
+ rootOperation = op;
+ break;
+ }
+ }
+ }
+
+ if (!rootOperation) {
+ // No root operation found, set it to none.
+ auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+ funcOp.getContext(), IREE::Codegen::DispatchLoweringPassPipeline::None);
+ if (failed(setTranslationInfo(funcOp, translationInfo))) {
+ return failure();
+ }
+ return success();
+ }
+
+ if (failed(setRootConfig(funcOp, rootOperation)))
+ return funcOp.emitOpError("failed to set root config");
+
+ propagateLoweringConfig(rootOperation, computeOps);
return success();
}
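With `initGPULaunchConfig` now taking a single function, the module-level strategy selection simply iterates the function-like ops and configures each one independently, as `LLVMGPUSelectLoweringStrategyPass` does further down. A sketch of that driver loop, assuming it runs inside a module pass (as in the diff below):

```
// Sketch: per-function launch-config selection driven from a module pass.
auto moduleOp = getOperation();
for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
  if (failed(initGPULaunchConfig(funcOp)))
    return signalPassFailure();
}
```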
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.h
index 1cca718..a77a3a0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.h
@@ -16,7 +16,7 @@
// the lowering configuration.
int64_t getTargetSharedMemoryLimitInBytes(FunctionOpInterface entryPoint);
-LogicalResult initGPULaunchConfig(ModuleOp moduleOp);
+LogicalResult initGPULaunchConfig(FunctionOpInterface funcOp);
} // namespace mlir::iree_compiler
#endif // IREE_COMPILER_CODEGEN_LLVMGPU_KERNELCONFIG_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 82a98e0..27f8068 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree-dialects/Dialect/VectorExt/IR/VectorExtDialect.h"
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
#include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
@@ -13,6 +14,7 @@
#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/PDL/IR/PDL.h"
@@ -66,20 +68,25 @@
} // namespace
void LLVMGPULowerExecutableTargetPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
+ auto funcOp = getOperation();
+ auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+ bool enableMicrokernels = targetAttr && hasUkernel(targetAttr);
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- variantOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (!translationInfo)
+ return;
+
+ std::optional<OpPassManager> maybePipeline =
+ getFunctionOpInterfacePassManager(funcOp);
+ if (!maybePipeline) {
+ funcOp.emitOpError(
+ "unhandled function-like container during executable lowering");
return signalPassFailure();
}
+ OpPassManager &pipeline = maybePipeline.value();
- bool enableMicrokernels = hasUkernel(variantOp.getTarget());
- OpPassManager pipeline(IREE::HAL::ExecutableVariantOp::getOperationName());
- switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
+ switch (translationInfo.getDispatchLoweringPassPipeline()) {
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDefault:
addGPUDefaultPassPipeline(pipeline, enableMicrokernels);
break;
@@ -97,9 +104,9 @@
break;
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulTensorCore: {
FailureOr<int64_t> maybeDepth =
- getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
+ getSoftwarePipelineDepth(translationInfo.getConfiguration());
if (failed(maybeDepth)) {
- variantOp.emitOpError(
+ funcOp.emitOpError(
"invalid matmul configuration without software pipelining config");
return signalPassFailure();
}
@@ -109,9 +116,9 @@
case IREE::Codegen::DispatchLoweringPassPipeline::
LLVMGPUMatmulTensorCoreMmaSync: {
FailureOr<int64_t> maybeDepth =
- getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
+ getSoftwarePipelineDepth(translationInfo.getConfiguration());
if (failed(maybeDepth)) {
- variantOp.emitOpError(
+ funcOp.emitOpError(
"invalid matmul configuration without software pipelining config");
return signalPassFailure();
}
@@ -130,27 +137,20 @@
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUPackUnPack:
addGPUPackUnPackPasses(pipeline);
break;
- // Transform-dialect pipelines.
- case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: {
- SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec();
- addGPUTransformDialectPasses(
- pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef(""));
- break;
- }
// no pipeline specified, nothing to do.
case IREE::Codegen::DispatchLoweringPassPipeline::None:
return;
default:
- variantOp.emitOpError("unsupported pipeline on GPU target.");
+ funcOp.emitOpError("unsupported pipeline on GPU target.");
return signalPassFailure();
}
- if (failed(runPipeline(pipeline, variantOp))) {
+ if (failed(runPipeline(pipeline, funcOp))) {
return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPULowerExecutableTargetPass() {
return std::make_unique<LLVMGPULowerExecutableTargetPass>();
}
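Since the lowering pass is now an `InterfacePass<FunctionOpInterface>`, the backend's executable-level pipeline has to nest it under each function-like op rather than adding it at the `hal.executable.variant` level. A sketch of how that nesting could look with the `FunctionLikeNest` helper that Passes.cpp uses below; the exact call site in the backend pipeline may differ:

```
// Sketch: nest the function-level lowering pass from a module-level pipeline.
FunctionLikeNest(modulePassManager)
    .addPass(createLLVMGPULowerExecutableTargetPass);
```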
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp
index ba7ccce..ff6538d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp
@@ -66,10 +66,10 @@
/// module.
template <typename F>
static LogicalResult
-verifyLoweringConfiguration(ModuleOp module,
+verifyLoweringConfiguration(FunctionOpInterface funcOp,
IREE::Codegen::TranslationInfoAttr translationInfo,
ArrayRef<int64_t> workgroupSize, F verificationFn) {
- auto walkResult = module.walk([&](Operation *op) -> WalkResult {
+ auto walkResult = funcOp.walk([&](Operation *op) -> WalkResult {
IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
if (!loweringConfig)
return WalkResult::advance();
@@ -79,49 +79,41 @@
}
static LogicalResult
-verifyEntryPoint(ModuleOp moduleOp,
- IREE::Codegen::TranslationInfoAttr translationInfo,
- IREE::HAL::ExecutableExportOp exportOp) {
- std::optional<mlir::ArrayAttr> workgroupSizeAttr =
- exportOp.getWorkgroupSize();
-
- if (workgroupSizeAttr.has_value()) {
- std::array<int64_t, 3> workgroupSizes;
- for (auto [index, attr] : llvm::enumerate(workgroupSizeAttr.value())) {
- workgroupSizes[index] = llvm::cast<IntegerAttr>(attr).getInt();
- }
- return verifyLoweringConfiguration(moduleOp, translationInfo,
- workgroupSizes, verifyGPUMatmulPipeline);
+verifyEntryPoint(FunctionOpInterface funcOp,
+ IREE::Codegen::TranslationInfoAttr translationInfo) {
+ std::optional<SmallVector<int64_t>> workgroupSize = getWorkgroupSize(funcOp);
+ if (!workgroupSize) {
+ return funcOp->emitOpError(
+ "failed to get workgroup size needed for verification");
}
+
+ return verifyLoweringConfiguration(
+ funcOp, translationInfo, workgroupSize.value(), verifyGPUMatmulPipeline);
return success();
}
void LLVMGPUSelectLoweringStrategyPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- ModuleOp moduleOp = variantOp.getInnerModule();
+ auto moduleOp = getOperation();
+ for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
+ if (failed(initGPULaunchConfig(funcOp))) {
+ return signalPassFailure();
+ }
- if (failed(initGPULaunchConfig(moduleOp))) {
- return signalPassFailure();
- }
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (!translationInfo) {
+      // Don't do anything if translation info is not set.
+ return;
+ }
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- moduleOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
- return signalPassFailure();
- }
-
- // Verify the properties of each entry point based on the target pipeline.
- for (auto exportOp : variantOp.getExportOps()) {
- if (failed(verifyEntryPoint(moduleOp, translationInfo.value(), exportOp))) {
+ // Verify the properties of each entry point based on the target pipeline.
+ if (failed(verifyEntryPoint(funcOp, translationInfo))) {
return signalPassFailure();
}
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createLLVMGPUSelectLoweringStrategyPass() {
return std::make_unique<LLVMGPUSelectLoweringStrategyPass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
index d9b0b99..ec98dec 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
@@ -214,8 +214,6 @@
void runOnOperation() override {
MLIRContext *context = &getContext();
auto funcOp = getOperation();
- if (!isEntryPoint(funcOp))
- return;
// Promote C matrix and propagate the potential fill producer into the temp
// allocation. This needs to be done before reduction tiling.
@@ -241,10 +239,15 @@
funcOp.dump();
});
- auto workgroupSize = llvm::map_to_vector(
- getEntryPoint(funcOp)->getWorkgroupSize().value(),
- [&](Attribute attr) { return llvm::cast<IntegerAttr>(attr).getInt(); });
+ std::optional<SmallVector<int64_t>> maybeWorkgroupSize =
+ getWorkgroupSize(funcOp);
+ if (!maybeWorkgroupSize) {
+ funcOp.emitOpError("expected workgroup size to be set on the lowering "
+ "config for the function");
+ return signalPassFailure();
+ }
+ SmallVector<int64_t> workgroupSize = maybeWorkgroupSize.value();
int64_t flatWorkgroupSize =
workgroupSize[0] * workgroupSize[1] * workgroupSize[2];
// Only promote to workgroup size if there are multiple warps.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
index 584ba85..7c11d25 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
@@ -362,7 +362,17 @@
workgroupSize[i] = llvm::cast<IntegerAttr>(size).getInt();
}
} else {
- workgroupSize = getWorkgroupSize(func);
+ std::optional<SmallVector<int64_t>> maybeWorkgroupSize =
+ getWorkgroupSize(func);
+ if (!maybeWorkgroupSize) {
+ return;
+ }
+ for (auto [index, value] : llvm::enumerate(maybeWorkgroupSize.value())) {
+ workgroupSize[index] = value;
+ }
+ for (auto index : llvm::seq<size_t>(maybeWorkgroupSize->size(), 3)) {
+ workgroupSize[index] = 1;
+ }
}
llvm::StringLiteral scheduleAttrName =
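The padding logic added above normalizes a possibly shorter workgroup-size vector to exactly three dimensions. The same idea as a small standalone helper (name hypothetical, mirroring the diff's logic and assuming at most three entries are provided):

```
// Sketch: pad a workgroup size to three dimensions, filling trailing
// entries with 1; returns {1, 1, 1} when no size is attached.
static std::array<int64_t, 3>
normalizeWorkgroupSize(std::optional<SmallVector<int64_t>> maybeSize) {
  std::array<int64_t, 3> workgroupSize = {1, 1, 1};
  if (!maybeSize)
    return workgroupSize;
  for (auto [index, value] : llvm::enumerate(*maybeSize))
    workgroupSize[index] = value;
  return workgroupSize;
}
```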
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index e871753..fa52c5e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -21,6 +21,7 @@
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "iree/compiler/Dialect/Util/Transforms/Passes.h"
+#include "iree/compiler/Utils/PassUtils.h"
#include "llvm/Support/CommandLine.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h"
@@ -146,11 +147,10 @@
// Returns success when workgroup reordering is supported for `funcOp`.
// On ROCm, we require workgroup counts to be static.
static LogicalResult canReorderWorkgroups(FunctionOpInterface funcOp) {
- auto variantOp = getExecutableVariantOp(funcOp);
- if (failed(variantOp))
+ auto target = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+ if (!target) {
return failure();
-
- IREE::HAL::ExecutableTargetAttr target = variantOp->getTarget();
+ }
if (target.getBackend() != "rocm")
return success();
@@ -169,334 +169,287 @@
// Common Pass Recipes
//===----------------------------------------------------------------------===//
-static void addBufferizePasses(OpPassManager &passManager) {
+static void addBufferizePasses(OpPassManager &funcPassManager) {
BufferizationOptions::AllocationFn allocationFn = gpuAllocationFn;
BufferizationOptions::MemCpyFn memcpyFn = gpuCopyFn;
- addIREEComprehensiveBufferizePasses(passManager, allocationFn, memcpyFn);
- passManager.addPass(createCanonicalizerPass());
- passManager.addPass(createCSEPass());
+ addIREEComprehensiveBufferizePasses(funcPassManager, allocationFn, memcpyFn);
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
static void
-tileAndDistributeToWorkgroup(OpPassManager &pm,
+tileAndDistributeToWorkgroup(OpPassManager &funcPassManager,
bool useWARForCooperativeMatrixCodegen = false) {
- pm.addPass(createTileAndDistributeToWorkgroupsPass(
+ funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
kNumMaxParallelDims,
linalg::DistributionMethod::CyclicNumProcsEqNumIters));
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass(
- useWARForCooperativeMatrixCodegen));
+ funcPassManager.addPass(createConvertToDestinationPassingStylePass(
+ useWARForCooperativeMatrixCodegen));
// TODO(#16421): Disable decomposition due to failure in bufferization.
- // nestedModulePM.addNestedPass<func::FuncOp>(
+ // funcPassManager.addPass(
// IREE::LinalgExt::createTileAndDecomposeAttentionPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
-static void tileAndBufferize(OpPassManager &pm) {
- tileAndDistributeToWorkgroup(pm, /*useWARForCooperativeMatrixCodegen=*/true);
-
- auto &nestedModulePM = pm.nest<ModuleOp>();
- addBufferizePasses(nestedModulePM);
+static void tileAndBufferize(OpPassManager &funcPassManager) {
+ tileAndDistributeToWorkgroup(funcPassManager,
+ /*useWARForCooperativeMatrixCodegen=*/true);
+ addBufferizePasses(funcPassManager);
}
-static void addGPUVectorizationPasses(OpPassManager &pm) {
- pm.addNestedPass<func::FuncOp>(createDecomposeConvolutionToLowerDimOpsPass());
+static void addGPUVectorizationPasses(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass());
GenericVectorizationPassOptions options;
options.vectorizePadding = true;
options.vectorizeGatherAccesses = true;
options.enableCleanup = false;
options.foldCastIntoContract = true;
- pm.addNestedPass<func::FuncOp>(createGenericVectorizationPass(options));
- pm.addNestedPass<func::FuncOp>(createOptimizeTensorInsertExtractSlicesPass());
- pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- pm.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
//===---------------------------------------------------------------------===//
// Default Vectorization
//===---------------------------------------------------------------------===//
-void addGPUVectorizationPassPipeline(OpPassManager &pm) {
- tileAndDistributeToWorkgroup(pm);
+void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager) {
+ tileAndDistributeToWorkgroup(funcPassManager);
- auto &nestedModulePM = pm.nest<ModuleOp>();
-
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createWorkgroupSpecializationPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createWorkgroupSpecializationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Distribute linalg onto threads within the workgroup.
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUTensorTile(false));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createGPUTensorTile(false));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Linalg -> vector
- addGPUVectorizationPasses(nestedModulePM);
+ addGPUVectorizationPasses(funcPassManager);
// tensor to memref
- addBufferizePasses(nestedModulePM);
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUDistribute());
+ addBufferizePasses(funcPassManager);
+ funcPassManager.addPass(createGPUDistribute());
// Post bufferization optimizations.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeVectorTransferPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createOptimizeVectorTransferPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
}
//===---------------------------------------------------------------------===//
// MatmulSIMT
//===---------------------------------------------------------------------===//
-void addGPUMatmulSimtPassPipeline(OpPassManager &pm) {
- tileAndDistributeToWorkgroup(pm);
- auto &nestedModulePM = pm.nest<ModuleOp>();
+void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager) {
+ tileAndDistributeToWorkgroup(funcPassManager);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createWorkgroupSpecializationPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createWorkgroupSpecializationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUTensorTileToSerialLoops());
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUTensorAlloc());
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUTensorTile(false));
+ funcPassManager.addPass(createGPUTensorTileToSerialLoops());
+ funcPassManager.addPass(createGPUTensorAlloc());
+ funcPassManager.addPass(createGPUTensorTile(false));
// Linalg -> vector
- addGPUVectorizationPasses(nestedModulePM);
+ addGPUVectorizationPasses(funcPassManager);
// tensor to memref
- addBufferizePasses(nestedModulePM);
+ addBufferizePasses(funcPassManager);
// distribute foreach threads
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUDistribute());
+ funcPassManager.addPass(createGPUDistribute());
- nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUDistributeSharedMemoryCopy());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createGPUDistributeSharedMemoryCopy());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUReduceSharedMemoryBankConflicts());
- nestedModulePM.addNestedPass<func::FuncOp>(createReorderWorkgroups(
+ funcPassManager.addPass(createGPUReduceSharedMemoryBankConflicts());
+ funcPassManager.addPass(createReorderWorkgroups(
clReorderWorkgroupsStrategy, clReorderWorkgroupsLogSwizzleTile,
canReorderWorkgroups));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Even though we vectorize before bufferization we are not able to hoist
// accumulator load/store out of the K loop until distribution. Therefore we
// still rely on buffer level transformations for transfer ops hoisting and
   // store to load forwarding. This relies on shaky alias analysis and we need
// to move this to tensor level once we have better abstractions.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
// Hoist loop invariant code to avoid pipelining it.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
// Pipeline memory operations.
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass());
+ funcPassManager.addPass(createGPUPipeliningPass());
}
//===---------------------------------------------------------------------===//
// Matmul Tensor Core
//===---------------------------------------------------------------------===//
-void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm,
+void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth) {
- tileAndBufferize(pm);
+ tileAndBufferize(funcPassManager);
- auto &nestedModulePM = pm.nest<ModuleOp>();
// Distribute linalg onto warps within the workgroup.
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
if (pipelineDepth > 1)
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUMultiBuffering(pipelineDepth));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createGPUMultiBuffering(pipelineDepth));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createReorderWorkgroups(
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createReorderWorkgroups(
clReorderWorkgroupsStrategy, clReorderWorkgroupsLogSwizzleTile,
canReorderWorkgroups));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Linalg -> vector
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMGPUTensorCoreVectorizationPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeVectorTransferPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createLLVMGPUTensorCoreVectorizationPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createOptimizeVectorTransferPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
// Distribute shared memory copies.
- nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUDistributeSharedMemoryCopy());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUReduceSharedMemoryBankConflicts());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createGPUDistributeSharedMemoryCopy());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createGPUReduceSharedMemoryBankConflicts());
// Vector -> MMA ops
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUVectorToGPU());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createLLVMGPUVectorToGPU());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Hoist loop invariant code to avoid pipelining it.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
// Pipeline memory operations.
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass(
+ funcPassManager.addPass(createGPUPipeliningPass(
/*epiloguePeeling=*/false, pipelineDepth,
PipeliningSchedulingStrategy::loadGlobalStage0));
// Optimize shared memory usage.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMGPUPackSharedMemoryAlloc());
+ funcPassManager.addPass(createLLVMGPUPackSharedMemoryAlloc());
}
//===---------------------------------------------------------------------===//
// Matmul MMA.Sync
//===---------------------------------------------------------------------===//
-void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &pm,
+void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth) {
- tileAndBufferize(pm);
+ tileAndBufferize(funcPassManager);
- auto &nestedModulePM = pm.nest<ModuleOp>();
// Distribute linalg onto warps within the workgroup.
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
if (pipelineDepth > 1)
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUMultiBuffering(pipelineDepth));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createGPUMultiBuffering(pipelineDepth));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createReorderWorkgroups(
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createReorderWorkgroups(
clReorderWorkgroupsStrategy, clReorderWorkgroupsLogSwizzleTile,
canReorderWorkgroups));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Linalg -> vector
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType::MMA_SYNC));
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeVectorTransferPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createOptimizeVectorTransferPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
// Distribute shared memory copies.
- nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUDistributeSharedMemoryCopy());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createGPUDistributeSharedMemoryCopy());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Vector -> MMA ops
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(
createLLVMGPUVectorToGPU(GPUTensorCoreType::MMA_SYNC));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Hoist loop invariant code to avoid pipelining it.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
// Pipeline memory operations.
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass(
+ funcPassManager.addPass(createGPUPipeliningPass(
/*epiloguePeeling=*/false, pipelineDepth,
PipeliningSchedulingStrategy::nvidiaTensorCore));
// Optimize shared memory usage.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMGPUPackSharedMemoryAlloc());
+ funcPassManager.addPass(createLLVMGPUPackSharedMemoryAlloc());
}
//===---------------------------------------------------------------------===//
// Transpose
//===---------------------------------------------------------------------===//
-void addGPUTransposePassPipeline(OpPassManager &pm) {
- tileAndDistributeToWorkgroup(pm);
- auto &nestedModulePM = pm.nest<ModuleOp>();
+void addGPUTransposePassPipeline(OpPassManager &funcPassManager) {
+ tileAndDistributeToWorkgroup(funcPassManager);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createWorkgroupSpecializationPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createWorkgroupSpecializationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createGPUTensorAlloc(GPUPromoteSharedMemPattern::TransposeOpPattern));
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUTensorTile(false));
+ funcPassManager.addPass(createGPUTensorTile(false));
// Linalg -> vector
- addGPUVectorizationPasses(nestedModulePM);
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeVectorTransferPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
+ addGPUVectorizationPasses(funcPassManager);
+ funcPassManager.addPass(createOptimizeVectorTransferPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
// tensor to memref
- addBufferizePasses(nestedModulePM);
+ addBufferizePasses(funcPassManager);
// distribute foreach threads
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUDistribute());
+ funcPassManager.addPass(createGPUDistribute());
- nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUDistributeSharedMemoryCopy());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createGPUDistributeSharedMemoryCopy());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
   // May or may not need to reduce shared memory conflicts
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
createGPUReduceSharedMemoryBankConflicts(/*paddingSizeBits=*/32));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
//===---------------------------------------------------------------------===//
@@ -556,88 +509,77 @@
return success();
}
-static void addVectorBufferizePasses(OpPassManager &passManager) {
+static void addVectorBufferizePasses(OpPassManager &funcPassManager) {
BufferizationOptions::AllocationFn allocationFn = gpuAllocationFn;
BufferizationOptions::MemCpyFn memcpyFn = gpuCopyFn;
- addIREEComprehensiveBufferizePasses(passManager, allocationFn, memcpyFn);
- passManager.addPass(createCanonicalizerPass());
- passManager.addPass(createCSEPass());
+ addIREEComprehensiveBufferizePasses(funcPassManager, allocationFn, memcpyFn);
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
-void addGPUVectorDistributePassPipeline(OpPassManager &pm) {
- tileAndDistributeToWorkgroup(pm);
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(createReorderWorkgroups(
+void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager) {
+ tileAndDistributeToWorkgroup(funcPassManager);
+ funcPassManager.addPass(createReorderWorkgroups(
clReorderWorkgroupsStrategy, clReorderWorkgroupsLogSwizzleTile,
canReorderWorkgroups));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Problem specific (reduction) tiling.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUTensorTileToSerialLoops());
+ funcPassManager.addPass(createGPUTensorTileToSerialLoops());
// Generalize all named ops so that we can fold away unit extent dims. By this
// point, all tiling is finished so the tiling configurations on those ops can
// be safely dropped. This additionally allows vectorization of convolution to
// `vector.contract` as filter dimensions are expected to be tiled to 1 by
// this point.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLinalgGeneralizeNamedOpsPass());
+ funcPassManager.addPass(createLinalgGeneralizeNamedOpsPass());
LinalgFoldUnitExtentDimsPassOptions options;
options.useRankReducingSlices = true;
- nestedModulePM.addNestedPass<func::FuncOp>(
- mlir::createLinalgFoldUnitExtentDimsPass(options));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(options));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
// Linalg -> Vector
- addGPUVectorizationPasses(nestedModulePM);
+ addGPUVectorizationPasses(funcPassManager);
// Allocate tensors for copies to shared memory.
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUVectorAlloc());
+ funcPassManager.addPass(createGPUVectorAlloc());
// Tensor -> Memref
- addVectorBufferizePasses(nestedModulePM);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createHoistStaticallyBoundAllocationsPass());
+ addVectorBufferizePasses(funcPassManager);
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass());
// Vector SIMD -> Vector SIMT
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMGPUCastTypeToFitMMAPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUVectorDistribute());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
-
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(createLLVMGPUCastTypeToFitMMAPass());
+ funcPassManager.addPass(createLLVMGPUVectorDistribute());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(
createGPUReduceSharedMemoryBankConflicts(/*paddingSizeBits=*/64));
+ funcPassManager.addPass(createGPUReduceSharedMemoryBankConflicts());
if (clLLVMGPUEnablePrefetch) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLLVMGPUPrefetchSharedMemoryPass());
+ funcPassManager.addPass(createLLVMGPUPrefetchSharedMemoryPass());
}
-
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
-void addGPUWarpReductionPassPipeline(OpPassManager &pm) {
- tileAndDistributeToWorkgroup(pm);
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRematerializeParallelOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUTileReductionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager) {
+ tileAndDistributeToWorkgroup(funcPassManager);
+ funcPassManager.addPass(createRematerializeParallelOpsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createGPUTileReductionPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Linalg -> vector
{
@@ -648,122 +590,104 @@
options.vectorizeGatherAccesses = true;
options.enableCleanup = false;
options.generateContract = false;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- addBufferizePasses(nestedModulePM);
+ addBufferizePasses(funcPassManager);
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeVectorTransferPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createOptimizeVectorTransferPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createForOpCanonicalizationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
auto getSubgroupSizeFn = [](mlir::FunctionOpInterface func) -> int {
- auto moduleOp = func->getParentOfType<ModuleOp>();
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
- IREE::HAL::ExecutableExportOp exportOp = exportOps.lookup(func.getName());
- std::optional<int64_t> maybeSubgroupSize = getSubgroupSize(exportOp);
- return maybeSubgroupSize.value_or(kDefaultSubgroupSize);
+    // TODO: This kind of callback function is a really bad idea.
+    // This should be easier to resolve than doing this.
+ if (std::optional<int64_t> maybeSubgroupSize = getSubgroupSize(func)) {
+ return maybeSubgroupSize.value();
+ }
+ return kDefaultSubgroupSize;
};
// vector -> simt gpu + vector
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertVectorReductionToGPUPass(/*expandSubgroupReduction=*/true,
- getSubgroupSizeFn));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createConvertVectorReductionToGPUPass(
+ /*expandSubgroupReduction=*/true, getSubgroupSizeFn));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
-void addGPUPackUnPackPasses(OpPassManager &pm) {
- tileAndDistributeToWorkgroup(pm);
- auto &nestedModulePM = pm.nest<ModuleOp>();
+void addGPUPackUnPackPasses(OpPassManager &funcPassManager) {
+ tileAndDistributeToWorkgroup(funcPassManager);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createWorkgroupSpecializationPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createWorkgroupSpecializationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUTensorTile(false));
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(createGPUTensorTile(false));
+ funcPassManager.addPass(
createDecomposePackUnPackOpsPass(/*tileOuterToOne=*/true));
- addGPUVectorizationPasses(nestedModulePM);
+ addGPUVectorizationPasses(funcPassManager);
- addBufferizePasses(nestedModulePM);
+ addBufferizePasses(funcPassManager);
// distribute foreach threads
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUDistribute());
+ funcPassManager.addPass(createGPUDistribute());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createSplitFullPartialTransferPass("linalg-copy"));
+ funcPassManager.addPass(createSplitFullPartialTransferPass("linalg-copy"));
}
-void addGPUSimpleDistributePassPipeline(OpPassManager &pm) {
- tileAndBufferize(pm);
+void addGPUSimpleDistributePassPipeline(OpPassManager &funcPassManager) {
+ tileAndBufferize(funcPassManager);
- auto &nestedModulePM = pm.nest<ModuleOp>();
// Distribute linalg onto threads within the workgroup.
- nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUTileAndDistribute());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createLLVMGPUTileAndDistribute());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
}
-void addGPUDefaultPassPipeline(OpPassManager &pm, bool enableMicrokernels) {
- tileAndDistributeToWorkgroup(pm, /*useWARForCooperativeMatrixCodegen=*/true);
- auto &nestedModulePM = pm.nest<ModuleOp>();
+void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
+ bool enableMicrokernels) {
+ tileAndDistributeToWorkgroup(funcPassManager,
+ /*useWARForCooperativeMatrixCodegen=*/true);
if (enableMicrokernels) {
- nestedModulePM.addPass(createGPULowerToUKernelsPass());
+ funcPassManager.addPass(createGPULowerToUKernelsPass());
}
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- addBufferizePasses(nestedModulePM);
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ addBufferizePasses(funcPassManager);
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
}
-void addGPUBaseLoweringPassPipeline(OpPassManager &pm) {
- auto &nestedModulePM = pm.nest<ModuleOp>();
+void addGPUBaseLoweringPassPipeline(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(createConvertToDestinationPassingStylePass(
+ /*useWARForCooperativeMatrixCodegen=*/false));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass(
- /*useWARForCooperativeMatrixCodegen=*/false));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ addBufferizePasses(funcPassManager);
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- addBufferizePasses(nestedModulePM);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
-
- nestedModulePM.addNestedPass<func::FuncOp>(
- IREE::LinalgExt::createLinalgExtToLoopsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(IREE::LinalgExt::createLinalgExtToLoopsPass());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createConvertLinalgToLoopsPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
// Add passes to make the address computation more explicit and optimize them.
@@ -773,132 +697,150 @@
// loops.
//
// Note that this needs to run before SCF -> CF.
-static void addLowerAndOptimizeAddressComputationPasses(OpPassManager &pm) {
- pm.addPass(createExtractAddressComputationGPUPass());
- pm.addNestedPass<func::FuncOp>(memref::createExpandOpsPass());
- pm.addPass(memref::createFoldMemRefAliasOpsPass());
- pm.addPass(memref::createExpandStridedMetadataPass());
- // Hoist loop invariant variables to give affine decomposition pass the right
- // loop dependencies.
- pm.addPass(createLoopInvariantCodeMotionPass());
- // Decompose affine ops.
- pm.addPass(createDecomposeAffineOpsPass());
- // Get rid of the redundant computations.
- pm.addPass(createCSEPass());
- // Hoist the resulting decompositions.
- pm.addPass(createLoopInvariantCodeMotionPass());
- pm.addPass(createLowerAffinePass());
+static void
+addLowerAndOptimizeAddressComputationPasses(FunctionLikeNest &funcPassManager) {
+ funcPassManager.addPass(createExtractAddressComputationGPUPass)
+ .addPass(memref::createExpandOpsPass)
+ .addPass(memref::createFoldMemRefAliasOpsPass)
+ .addPass(memref::createExpandStridedMetadataPass)
+ // Hoist loop invariant variables to give affine decomposition pass the
+ // right loop dependencies.
+ .addPass(createLoopInvariantCodeMotionPass)
+ // Decompose affine ops.
+ .addPass(createDecomposeAffineOpsPass)
+ // Get rid of the redundant computations.
+ .addPass(createCSEPass)
+ // Hoist the resulting decompositions.
+ .addPass(createLoopInvariantCodeMotionPass)
+ .addPass(createLowerAffinePass);
}
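
The hunks above all make the same mechanical change: instead of nesting an inner `ModuleOp` pass manager and calling `addNestedPass<func::FuncOp>` for every pass, the pipeline builders now either take the function-level pass manager directly or wrap the module-level pass manager in a `FunctionLikeNest`, whose `addPass` takes a pass constructor and returns the nest so calls can be chained. A minimal sketch of that before/after shape, assuming `FunctionLikeNest` lives in `Codegen/Common/PassUtils.h` as the includes added elsewhere in this PR suggest:

```
// Illustration only: the migration pattern used throughout this file.
// The namespace/header for FunctionLikeNest are assumptions based on how the
// helper is used in this diff, not a verified API reference.
#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;

static void addExamplePasses(OpPassManager &modulePassManager) {
  // Old style (removed in this PR):
  //   auto &nestedModulePM = pm.nest<ModuleOp>();
  //   nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
  //   nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
  //
  // New style: the nest runs each pass on every FunctionOpInterface op inside
  // the module, so func.func is no longer hard-coded. Note that addPass takes
  // a pass *constructor* (a callable), not a constructed pass instance.
  iree_compiler::FunctionLikeNest(modulePassManager)
      .addPass(createCanonicalizerPass)
      .addPass(createCSEPass);
}
```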
-static void addLowerToLLVMGPUPasses(OpPassManager &pm, bool forROCDL) {
- pm.addPass(createConvertHALDescriptorTypeToGPUAddressSpacePass());
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
+static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager,
+ bool forROCDL) {
+ modulePassManager.addPass(
+ createConvertHALDescriptorTypeToGPUAddressSpacePass());
+ modulePassManager.addPass(createCanonicalizerPass());
+ modulePassManager.addPass(createCSEPass());
- pm.addPass(createLowerUKernelOpsToCallsPass());
+ modulePassManager.addPass(createLowerUKernelOpsToCallsPass());
- // LinalgExt -> SCF
- pm.addNestedPass<func::FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());
+ FunctionLikeNest(modulePassManager)
+ // LinalgExt -> SCF
+ .addPass(IREE::LinalgExt::createLinalgExtToLoopsPass)
- // Linalg -> SCF
- pm.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
- pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- pm.addNestedPass<func::FuncOp>(createCSEPass());
+ // Linalg -> SCF
+ .addPass(createMemrefCopyToLinalgPass)
+ .addPass(createConvertLinalgToLoopsPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
- // Pad allocations with dynamic dimension after linalg lowering but before
- // lowering SCF and affine ops.
- pm.addNestedPass<func::FuncOp>(createPadDynamicAlloc());
+ // Pad allocations with dynamic dimension after linalg lowering but before
+ // lowering SCF and affine ops.
+ .addPass(createPadDynamicAlloc)
- pm.addPass(createLowerAffinePass());
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
+ .addPass(createLowerAffinePass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass);
  // Handle tensor constants.
- pm.addPass(arith::createConstantBufferizePass());
- pm.addPass(createFoldTensorExtractOpPass());
+ modulePassManager.addPass(arith::createConstantBufferizePass());
- pm.addNestedPass<func::FuncOp>(createLLVMGPUVectorLoweringPass());
+ FunctionLikeNest funcPassManager(modulePassManager);
+ funcPassManager.addPass(createFoldTensorExtractOpPass)
+ .addPass(createLLVMGPUVectorLoweringPass);
// This pass needs to run before SCF -> CF.
- addLowerAndOptimizeAddressComputationPasses(pm);
+ addLowerAndOptimizeAddressComputationPasses(funcPassManager);
// Run checks on shared memory usage.
- auto getSharedMemoryLimitInBytes = [](mlir::FunctionOpInterface entryPoint) {
- return getTargetSharedMemoryLimitInBytes(entryPoint);
- };
- // TODO: query this from the target.
- auto getIndexBitwidth = [](mlir::FunctionOpInterface) { return 64; };
- pm.addPass(createGPUCheckResourceUsagePass(getSharedMemoryLimitInBytes,
- getIndexBitwidth));
-
- // SCF -> CF
- pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
- pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- pm.addNestedPass<func::FuncOp>(createCSEPass());
-
- // Handle complex operation conversion.
- pm.addPass(createConvertComplexToStandardPass());
-
- // Convert BF16 operations to occur as F32.
- pm.addPass(createConvertBf16ArithToF32Pass());
- pm.addPass(createConvertBf16ToUInt16BuffersPass());
-
- // Convert math dialect elementry functions to polynomial form.
- pm.addNestedPass<func::FuncOp>(createPolynomialApproximationPass());
-
- pm.addNestedPass<func::FuncOp>(memref::createExpandOpsPass());
- pm.addPass(memref::createFoldMemRefAliasOpsPass());
- pm.addPass(memref::createExpandStridedMetadataPass());
- pm.addPass(createEmulateNarrowTypePass());
- pm.addPass(affine::createAffineExpandIndexOpsPass());
- pm.addPass(createLowerAffinePass());
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
+ funcPassManager
+ .addPass([&]() {
+ // TODO: query this from the target.
+ auto getSharedMemoryLimit = [](mlir::FunctionOpInterface entryPoint) {
+ return getTargetSharedMemoryLimitInBytes(entryPoint);
+ };
+ auto getIndexBitwidth = [](mlir::FunctionOpInterface) { return 64; };
+ return createGPUCheckResourceUsagePass(getSharedMemoryLimit,
+ getIndexBitwidth);
+ })
+ // SCF -> CF
+ .addPass(createConvertSCFToCFPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ // Handle complex operation conversion.
+ .addPass(createConvertComplexToStandardPass)
+ // Convert BF16 operations to occur as F32.
+ .addPass(createConvertBf16ArithToF32Pass)
+ .addPass(createConvertBf16ToUInt16BuffersPass)
+ // Convert math dialect elementary functions to polynomial form.
+ .addPass(createPolynomialApproximationPass)
+ .addPass(memref::createExpandOpsPass)
+ .addPass(memref::createFoldMemRefAliasOpsPass)
+ .addPass(memref::createExpandStridedMetadataPass)
+ .addPass(createEmulateNarrowTypePass)
+ .addPass(affine::createAffineExpandIndexOpsPass)
+ .addPass(createLowerAffinePass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass);
// Strip out the debug info for the kernel.
- pm.addPass(createStripDebugInfoPass());
+ modulePassManager.addPass(createStripDebugInfoPass());
// Cast address spaces of all function arguments to generic.
- pm.addPass(createLLVMGPUCastAddressSpaceFunction());
+ modulePassManager.addPass(createLLVMGPUCastAddressSpaceFunction());
if (forROCDL) {
// convert to ROCDL.
- pm.addPass(createConvertToROCDLPass());
+ modulePassManager.addPass(createConvertToROCDLPass());
} else {
// convert to NVVM.
- pm.addPass(createConvertToNVVMPass());
+ modulePassManager.addPass(createConvertToNVVMPass());
}
}
extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugPayloadTag;
extern llvm::cl::opt<std::string> clGPUCodegenTransformDialectDebugTransformTag;
-void addGPUTransformDialectPasses(OpPassManager &passManager,
+void addGPUTransformDialectPasses(OpPassManager &funcPassManager,
StringRef entryPoint) {
- passManager.addPass(
+ funcPassManager.addPass(
mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint));
// Dropping the schedule is needed:
// 1. if we want to embed the transform in the module: we should drop the
// schedule once applied.
// 2. if transform.do_not_dce_operands ops are introduced.
- passManager.addPass(createDropSchedulePass());
+ funcPassManager.addPass(createDropSchedulePass());
}
//===----------------------------------------------------------------------===//
// Common Pass Pipelines
//===----------------------------------------------------------------------===//
-void buildLLVMGPUCodegenConfigurationPassPipeline(OpPassManager &pm) {
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUGeneralizeNamedOpsPass());
- addCommonTargetExecutablePreprocessingPasses(pm);
- pm.addPass(createLLVMGPUSelectLoweringStrategyPass());
+static void buildLLVMGPUCodegenConfigurationPassPipelineImpl(
+ OpPassManager &modulePassManager) {
+ {
+ FunctionLikeNest funcPassManager(modulePassManager);
+ funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
+ addCommonTargetExecutablePreprocessingPasses(funcPassManager);
+ }
+ modulePassManager.addPass(createMaterializeUserConfigsPass());
+
+ modulePassManager.addPass(createLLVMGPUSelectLoweringStrategyPass());
}
-void buildLLVMGPUCodegenPassPipeline(OpPassManager &pm, bool useROCM) {
- pm.addPass(createLLVMGPULowerExecutableTargetPass());
- OpPassManager &nestedModulePM = pm.nest<ModuleOp>();
+void buildLLVMGPUCodegenConfigurationPassPipeline(
+ OpPassManager &variantPassManager) {
+ buildLLVMGPUCodegenConfigurationPassPipelineImpl(
+ variantPassManager.nest<ModuleOp>());
+}
+
+void buildLLVMGPUCodegenPassPipeline(OpPassManager &variantPassManager,
+ bool useROCM) {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createLLVMGPULowerExecutableTargetPass);
+ variantPassManager.addPass(createReconcileTranslationInfoPass());
//===--------------------------------------------------------------------===//
// Convert Linalg ops to LLVM+NVVM/ROCDL ops.
//
@@ -906,11 +848,11 @@
// - All Linalg/Loops/GPU/Affine/Standard ops are converted away.
// - The module contains the final llvm.module ready to be serialized.
//===--------------------------------------------------------------------===//
- addLowerToLLVMGPUPasses(nestedModulePM, useROCM);
+ addLowerToLLVMGPUPasses(modulePassManager, useROCM);
LLVM_DEBUG({
llvm::dbgs() << "Using LLVMGPU pass pipeline:\n";
- pm.printAsTextualPipeline(llvm::dbgs());
+ variantPassManager.printAsTextualPipeline(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
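
Spelled out, `buildLLVMGPUCodegenPassPipeline` now assembles three nesting levels. The sketch below restates that structure in one place for orientation; it reuses the names from the code above, and the assumption that `variantPassManager` is anchored on `hal.executable.variant` follows from how the variant-level reconciliation pass is added last.

```
// Orientation sketch only: the nesting built by the pipeline above.
void buildExampleCodegenPipeline(OpPassManager &variantPassManager) {
  // Level 2: the inner builtin.module. Module-wide passes, such as the
  // transform-dialect interpreter, are added here.
  OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
  modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());

  // Level 3: every FunctionOpInterface op inside that module. Each function
  // picks its own lowering pipeline from its translation_info attribute.
  FunctionLikeNest(modulePassManager)
      .addPass(createLLVMGPULowerExecutableTargetPass);

  // Level 1: back at the variant, fold the per-function translation_info
  // into the hal.executable.export ops.
  variantPassManager.addPass(createReconcileTranslationInfoPass());
}
```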
@@ -919,21 +861,35 @@
// ROCDL Pass Pipelines
//===----------------------------------------------------------------------===//
-void buildROCDLCodegenConfigurationPassPipeline(OpPassManager &pm) {
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUGeneralizeNamedOpsPass());
- addCommonTargetExecutablePreprocessingPasses(pm);
- pm.addPass(createROCDLSelectLoweringStrategyPass());
+static void buildROCDLCodegenConfigurationPassPipelineImpl(
+ OpPassManager &modulePassManager) {
+ {
+ FunctionLikeNest funcPassManager(modulePassManager);
+ funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
+ addCommonTargetExecutablePreprocessingPasses(funcPassManager);
+ }
+ modulePassManager.addPass(createMaterializeUserConfigsPass());
+
+ modulePassManager.addPass(createROCDLSelectLoweringStrategyPass());
}
-void buildROCDLCodegenPassPipeline(OpPassManager &pm) {
- pm.addPass(createROCDLLowerExecutableTargetPass());
- OpPassManager &nestedModulePM = pm.nest<ModuleOp>();
- addLowerToLLVMGPUPasses(nestedModulePM, /*forROCDL=*/true);
+void buildROCDLCodegenConfigurationPassPipeline(
+ OpPassManager &variantPassManager) {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ buildROCDLCodegenConfigurationPassPipelineImpl(modulePassManager);
+}
+
+void buildROCDLCodegenPassPipeline(OpPassManager &variantPassManager) {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createROCDLLowerExecutableTargetPass);
+ variantPassManager.addPass(createReconcileTranslationInfoPass());
+ addLowerToLLVMGPUPasses(modulePassManager, /*forROCDL=*/true);
LLVM_DEBUG({
llvm::dbgs() << "Using ROCDL pass pipeline:\n";
- pm.printAsTextualPipeline(llvm::dbgs());
+ variantPassManager.printAsTextualPipeline(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
@@ -953,9 +909,10 @@
static PassPipelineRegistration<> LLVMGPUConfigPipeline(
"iree-codegen-llvmgpu-configuration-pipeline",
- "Runs the translation strategy configuration pipeline on Linalg for GPU",
- [](OpPassManager &passManager) {
- buildLLVMGPUCodegenConfigurationPassPipeline(passManager);
+ "Runs the translation strategy configuration pipeline on Linalg for GPU "
+ "on all functions in a module",
+ [](OpPassManager &modulePassManager) {
+ buildLLVMGPUCodegenConfigurationPassPipelineImpl(modulePassManager);
});
static PassPipelineRegistration<> LinalgNVVMPipeline(
@@ -989,8 +946,8 @@
static PassPipelineRegistration<> ROCDLConfigPipeline(
"iree-codegen-rocdl-configuration-pipeline",
"Runs pass pipeline to select a suitable lowering strategy for ROCDL",
- [](OpPassManager &passManager) {
- buildROCDLCodegenConfigurationPassPipeline(passManager);
+ [](OpPassManager &modulePassManager) {
+ buildROCDLCodegenConfigurationPassPipelineImpl(modulePassManager);
});
static PassPipelineRegistration<> LinalgROCDLPipeline(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index d30c89d..f99cfec 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -22,52 +22,56 @@
//===----------------------------------------------------------------------===//
/// Lowering using SIMT CUDA core operations.
-void addGPUMatmulSimtPassPipeline(OpPassManager &pm);
+void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager);
/// Lowering using mma.sync Tensor Core operations.
-void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &pm,
+void addGPUMatmulTensorCoreMmaSyncPassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth);
/// Lowering using wmma Tensor Core operations.
-void addGPUMatmulTensorCorePassPipeline(OpPassManager &pm,
+void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth);
-void addGPUPackUnPackPasses(OpPassManager &pm);
+void addGPUPackUnPackPasses(OpPassManager &funcPassManager);
/// Simple lowering that only distributes linalg ops on blocks and threads. This
/// will result in scalar operations. Expects pass manager to be a
/// module-level pass manager.
-void addGPUSimpleDistributePassPipeline(OpPassManager &pm);
+void addGPUSimpleDistributePassPipeline(OpPassManager &funcPassManager);
/// Transform dialect-based path.
-void addGPUTransformDialectPasses(OpPassManager &pm, StringRef entryPoint);
+void addGPUTransformDialectPasses(OpPassManager &funcPassManager,
+ StringRef entryPoint);
/// Lowering transpose using shared memory.
-void addGPUTransposePassPipeline(OpPassManager &pm);
+void addGPUTransposePassPipeline(OpPassManager &funcPassManager);
/// Lowering that calls vectorization patterns. Expects pass manager to be a
/// module-level pass manager.
-void addGPUVectorizationPassPipeline(OpPassManager &pm);
+void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager);
/// Lowering based on vector distribution patterns.
-void addGPUVectorDistributePassPipeline(OpPassManager &pm);
+void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager);
/// Lowering reductions to warp reductions.
-void addGPUWarpReductionPassPipeline(OpPassManager &pm);
+void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager);
/// Default pass pipeline on GPU, currently used only for the ukernel path.
-void addGPUDefaultPassPipeline(OpPassManager &pm, bool enableMicrokernels);
+void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
+ bool enableMicrokernels);
/// Pass pipeline to lower IREE HAL executables without tiling and distribution.
void addGPUBaseLoweringPassPipeline(OpPassManager &pm);
/// Populates passes needed to preprocess and select the translation strategy.
-void buildLLVMGPUCodegenConfigurationPassPipeline(OpPassManager &pm);
+void buildLLVMGPUCodegenConfigurationPassPipeline(
+ OpPassManager &variantPassManager);
/// Populates passes needed to lower an XLA HLO op to NVVM/ROCDL dialect via
/// the structured ops path. The pass manager `pm` in here should operate on
/// the module within the IREE::HAL::ExecutableOp.
-void buildLLVMGPUCodegenPassPipeline(OpPassManager &pm, bool useROCM);
+void buildLLVMGPUCodegenPassPipeline(OpPassManager &variantPassManager,
+ bool useROCM);
/// Performs the final conversion to NVVM+LLVM dialect.
std::unique_ptr<OperationPass<ModuleOp>> createConvertToNVVMPass();
@@ -88,16 +92,16 @@
createLLVMGPUDistribute();
/// Create pass selecting the lowering strategy for LLVMGPU.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createLLVMGPUSelectLoweringStrategyPass();
/// Create pass calling the dynamic pipeline for LLVMGPU.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPULowerExecutableTargetPass();
// Pass to pack shared memory allocations in order to reduce shared memory
// usage.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUPackSharedMemoryAlloc();
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
@@ -109,29 +113,28 @@
};
/// Convert Linalg ops to Vector and prepare conversion to GPU MMA ops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTensorCoreVectorizationPass(
GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);
/// Pass to pad out tensors up to static dimensions.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTensorPadPass();
/// Perform tiling and distribution to threads.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUTileAndDistribute(bool distributeToWarp = false);
// Pass to distribute vectorized functions.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUVectorDistribute();
/// Lower vector ops before conversion to LLVM.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createLLVMGPUVectorLoweringPass();
/// Converts vector ops to gpu dialect.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUVectorToGPU(
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createLLVMGPUVectorToGPU(
GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);
/// Lowering that calls vectorization patterns.
@@ -162,7 +165,7 @@
/// are the same and transposed from the LHS layout, this type
/// of transformation can avoid trips to shared memory/shuffle instructions
/// on operators like Flash Attention.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createAMDGPUPrepareForChainedMatmulPass();
//----------------------------------------------------------------------------//
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
index 80f0a20..cb3349e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
@@ -61,7 +61,7 @@
}
def LLVMGPULowerExecutableTarget :
- Pass<"iree-llvmgpu-lower-executable-target", "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ InterfacePass<"iree-llvmgpu-lower-executable-target", "mlir::FunctionOpInterface"> {
let summary = "Perform lowering of executable target using one of the IREE::HAL::DispatchLoweringPassPipeline";
let constructor = "mlir::iree_compiler::createLLVMGPULowerExecutableTargetPass()";
}
@@ -79,7 +79,7 @@
}
def LLVMGPUSelectLoweringStrategy :
- Pass<"iree-llvmgpu-select-lowering-strategy", "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ Pass<"iree-llvmgpu-select-lowering-strategy", "ModuleOp"> {
let summary = "Select a IREE::HAL::DispatchLoweringPassPipeline for lowering the target variant";
let constructor = "mlir::iree_compiler::createLLVMGPUSelectLoweringStrategyPass()";
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
index 0d92fc7..b9196b5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
@@ -337,90 +337,82 @@
// Entry Point
//===----------------------------------------------------------------------===//
-LogicalResult initROCDLLaunchConfig(ModuleOp moduleOp) {
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
+LogicalResult initROCDLLaunchConfig(FunctionOpInterface funcOp) {
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = exportOps.lookup(funcOp.getName());
- if (!exportOp)
- continue;
-
- // First check whether we already have workgroup count set--it's a
- // "contract" to indicate that we should bypass all tiling and
- // distribution to go down just the most basic lowering flow.
- if (Block *body = exportOp.getWorkgroupCountBody()) {
+ // First check whether we already have workgroup count set--it's a
+ // "contract" to indicate that we should bypass all tiling and
+ // distribution to go down just the most basic lowering flow.
+ if (auto exportOp = getEntryPoint(funcOp)) {
+ if (Block *body = exportOp->getWorkgroupCountBody()) {
auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
// For scalar dispatch cases--using just one thread of one workgroup.
auto isOne = [](Value value) { return matchPattern(value, m_One()); };
if (llvm::all_of(retOp.getOperands(), isOne)) {
std::array<int64_t, 3> workgroupSize = {1, 1, 1};
- if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
- return failure();
auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
funcOp.getContext(),
- IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering);
+ IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUBaseLowering,
+ workgroupSize);
if (failed(setTranslationInfo(funcOp, translationInfo))) {
return failure();
}
- continue;
+ return success();
}
}
+ }
- SmallVector<Operation *> computeOps = getComputeOps(funcOp);
- if (getTranslationInfo(exportOp)) {
- // Currently ROCDL requires propagation of user lowering configs.
- for (auto op : computeOps) {
- if (getLoweringConfig(op)) {
- propagateLoweringConfig(op, computeOps);
- break;
- }
+ SmallVector<Operation *> computeOps = getComputeOps(funcOp);
+ if (getTranslationInfo(funcOp)) {
+ // Currently ROCDL requires propagation of user lowering configs.
+ for (auto op : computeOps) {
+ if (getLoweringConfig(op)) {
+ propagateLoweringConfig(op, computeOps);
+ break;
}
- continue;
}
+ }
- Operation *rootOp = nullptr;
+ Operation *rootOp = nullptr;
- // Find the root operation. linalg.generic and linalg.fill are not root
- // operations if there are other compute operations present.
- for (Operation *op : llvm::reverse(computeOps)) {
- if (!isa<linalg::GenericOp, linalg::FillOp>(op)) {
+ // Find the root operation. linalg.generic and linalg.fill are not root
+ // operations if there are other compute operations present.
+ for (Operation *op : llvm::reverse(computeOps)) {
+ if (!isa<linalg::GenericOp, linalg::FillOp>(op)) {
+ rootOp = op;
+ break;
+ }
+ if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
+ // linalg.generic with `reduction` iterator types are roots as well.
+ if (genericOp.getNumLoops() != genericOp.getNumParallelLoops()) {
rootOp = op;
break;
}
- if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
- // linalg.generic with `reduction` iterator types are roots as well.
- if (genericOp.getNumLoops() != genericOp.getNumParallelLoops()) {
- rootOp = op;
- break;
- }
- }
}
-
- if (!rootOp) {
- for (Operation *op : llvm::reverse(computeOps)) {
- if (isa<linalg::GenericOp, linalg::FillOp>(op)) {
- rootOp = op;
- break;
- }
- }
- }
-
- if (!rootOp) {
- // No root operation found, set it to none.
- auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
- funcOp.getContext(), CodeGenPipeline::None);
- if (failed(setTranslationInfo(funcOp, translationInfo))) {
- return failure();
- }
- continue;
- }
-
- if (failed(setRootConfig(funcOp, rootOp)))
- continue;
-
- propagateLoweringConfig(rootOp, computeOps);
}
+
+ if (!rootOp) {
+ for (Operation *op : llvm::reverse(computeOps)) {
+ if (isa<linalg::GenericOp, linalg::FillOp>(op)) {
+ rootOp = op;
+ break;
+ }
+ }
+ }
+
+ if (!rootOp) {
+ // No root operation found, set it to none.
+ auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+ funcOp.getContext(), CodeGenPipeline::None);
+ if (failed(setTranslationInfo(funcOp, translationInfo))) {
+ return failure();
+ }
+ return success();
+ }
+
+ if (failed(setRootConfig(funcOp, rootOp)))
+ return failure();
+
+ propagateLoweringConfig(rootOp, computeOps);
return success();
}
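
Because the hunk above interleaves removed and added lines heavily, the root-op selection it implements is easier to read factored out. The sketch below restates the priority order (ops that are neither `linalg.generic` nor `linalg.fill` win, then reduction generics, then any generic/fill as a fallback) using the same names as the file; it is an illustration, not the actual helper.

```
// Sketch: the root-op selection priority implemented in initROCDLLaunchConfig.
static Operation *selectRootOp(ArrayRef<Operation *> computeOps) {
  // 1. Walk backwards and prefer any compute op that is neither
  //    linalg.generic nor linalg.fill, or a generic that performs a reduction.
  for (Operation *op : llvm::reverse(computeOps)) {
    if (!isa<linalg::GenericOp, linalg::FillOp>(op))
      return op;
    if (auto genericOp = dyn_cast<linalg::GenericOp>(op))
      if (genericOp.getNumLoops() != genericOp.getNumParallelLoops())
        return op;
  }
  // 2. Otherwise fall back to the last generic/fill op, if any.
  for (Operation *op : llvm::reverse(computeOps)) {
    if (isa<linalg::GenericOp, linalg::FillOp>(op))
      return op;
  }
  // 3. No root op: the caller sets translation_info to the None pipeline.
  return nullptr;
}
```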
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h
index 69fa139..b616ff7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h
@@ -7,11 +7,11 @@
#ifndef IREE_COMPILER_CODEGEN_LLVMGPU_ROCDLKERNELCONFIG_H_
#define IREE_COMPILER_CODEGEN_LLVMGPU_ROCDLKERNELCONFIG_H_
-#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
namespace mlir::iree_compiler {
-LogicalResult initROCDLLaunchConfig(ModuleOp moduleOp);
+LogicalResult initROCDLLaunchConfig(FunctionOpInterface funcOp);
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
index 83c7e88..c2cba1c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
@@ -4,12 +4,14 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h"
#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h"
#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -35,19 +37,24 @@
}
void runOnOperation() override {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
+ auto funcOp = getOperation();
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
if (!translationInfo) {
- variantOp.emitError(
- "unsupported entry point functions with different translation info");
- return signalPassFailure();
+ return;
}
- OpPassManager pipeline(variantOp.getOperationName());
+ std::optional<OpPassManager> maybePipeline =
+ getFunctionOpInterfacePassManager(funcOp);
+ if (!maybePipeline) {
+ funcOp.emitOpError(
+ "unhandled function-like container during executable lowering");
+ return signalPassFailure();
+ }
+ OpPassManager &pipeline = maybePipeline.value();
- switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
+ switch (translationInfo.getDispatchLoweringPassPipeline()) {
case CodeGenPipeline::LLVMGPUBaseLowering:
addGPUBaseLoweringPassPipeline(pipeline);
break;
@@ -58,18 +65,18 @@
case IREE::Codegen::DispatchLoweringPassPipeline::None:
return;
default:
- variantOp.emitOpError("unsupported pipeline on ROCDL target");
+ funcOp.emitOpError("unsupported pipeline on ROCDL target");
return signalPassFailure();
}
- if (failed(runPipeline(pipeline, variantOp))) {
+ if (failed(runPipeline(pipeline, funcOp))) {
return signalPassFailure();
}
}
};
} // namespace
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createROCDLLowerExecutableTargetPass() {
return std::make_unique<ROCDLLowerExecutableTargetPass>();
}
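
Condensed, the new `runOnOperation` above does three things: read `translation_info` from the function it runs on, build a dynamic pass manager anchored on that function-like op, and dispatch into one of the `addGPU*PassPipeline` builders. A consolidated sketch follows; only the pipeline cases visible in this hunk are shown, and `getFunctionOpInterfacePassManager` is assumed to come from the newly included `Codegen/Common/PassUtils.h`.

```
// Consolidated sketch of the runOnOperation() above; not a verbatim copy.
void lowerOneFunction(mlir::FunctionOpInterface funcOp) {
  // Functions without translation_info are skipped; other functions in the
  // same module may still be lowered.
  IREE::Codegen::TranslationInfoAttr translationInfo =
      getTranslationInfo(funcOp);
  if (!translationInfo)
    return;

  // Build a dynamic pipeline anchored on this particular function-like op.
  std::optional<OpPassManager> maybePipeline =
      getFunctionOpInterfacePassManager(funcOp);
  if (!maybePipeline)
    return; // the real pass emits an error and signals failure here
  OpPassManager &pipeline = *maybePipeline;

  switch (translationInfo.getDispatchLoweringPassPipeline()) {
  case CodeGenPipeline::LLVMGPUBaseLowering:
    addGPUBaseLoweringPassPipeline(pipeline);
    break;
  case IREE::Codegen::DispatchLoweringPassPipeline::None:
    return; // e.g. already handled by the transform-dialect interpreter
  default:
    return; // unsupported pipeline: the real pass fails with an error
  }
  // The pass then runs the assembled pipeline on just this function via the
  // protected Pass::runPipeline(pipeline, funcOp) member (not reproduced in
  // this free-function sketch).
}
```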
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h
index 3f8ffb4..122eebf 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h
@@ -18,11 +18,11 @@
/// Creates a pass that calls a dynamic pipeline to progressively lower Linalg
/// with tensor semantics to ROCDL.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createROCDLLowerExecutableTargetPass();
/// Creates a pass to select the lowering strategy for converting to ROCDL.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createROCDLSelectLoweringStrategyPass();
//===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
index 6c23d53..29e592f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
@@ -13,18 +13,16 @@
// ROCDL Passes (keep alphabetical)
//===----------------------------------------------------------------------===//
-def ROCDLLowerExecutableTarget : Pass<
- "iree-rocdl-lower-executable-target",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+def ROCDLLowerExecutableTarget : InterfacePass<
+ "iree-rocdl-lower-executable-target", "mlir::FunctionOpInterface"> {
let summary = "Lower an IREE hal.executable.variant op using a suitable "
"pass pipeline";
let constructor =
"mlir::iree_compiler::createROCDLLowerExecutableTargetPass()";
}
-def ROCDLSelectLoweringStrategy : Pass<
- "iree-rocdl-select-lowering-strategy",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+def ROCDLSelectLoweringStrategy :
+ Pass<"iree-rocdl-select-lowering-strategy", "ModuleOp"> {
let summary = "Select a suitable lowering strategy for an IREE "
"hal.executable.variant op";
let constructor =
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
index 5286cc0..b5d4836 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
@@ -30,25 +30,18 @@
}
void runOnOperation() override {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- ModuleOp moduleOp = variantOp.getInnerModule();
-
- if (failed(initROCDLLaunchConfig(moduleOp))) {
- return signalPassFailure();
- }
-
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- moduleOp.emitError(
- "unsupported entry point functions with different translation info");
- return signalPassFailure();
+ auto moduleOp = getOperation();
+ for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
+ if (failed(initROCDLLaunchConfig(funcOp))) {
+ funcOp.emitOpError("failed to set configuration");
+ return signalPassFailure();
+ }
}
}
};
} // namespace
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createROCDLSelectLoweringStrategyPass() {
return std::make_unique<ROCDLSelectLoweringStrategyPass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/BUILD.bazel
index 9c870dc..17c96e1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/BUILD.bazel
@@ -59,6 +59,7 @@
":LLVMGPUExtensionsOpGen",
"//compiler/src/iree/compiler/Codegen/Common",
"//compiler/src/iree/compiler/Codegen/Common/GPU:CommonGPUPasses",
+ "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
"//compiler/src/iree/compiler/Codegen/LLVMGPU/Utils",
"//compiler/src/iree/compiler/Codegen/Utils",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/CMakeLists.txt
index 3afc5fa..c9e2ef5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/CMakeLists.txt
@@ -64,6 +64,7 @@
MLIRViewLikeInterface
iree::compiler::Codegen::Common
iree::compiler::Codegen::Common::GPU::CommonGPUPasses
+ iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
iree::compiler::Codegen::LLVMGPU::Utils
iree::compiler::Codegen::Utils
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 2f448b6..48f27c2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -10,6 +10,7 @@
#include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
#include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
@@ -83,14 +84,6 @@
transform::TransformRewriter &rewriter, mlir::FunctionOpInterface target,
transform::ApplyToEachResultList &results,
transform::TransformState &state) {
- FailureOr<IREE::HAL::ExecutableExportOp> maybeExportOp =
- getEntryPoint(target);
- if (failed(maybeExportOp)) {
- state.getTopLevel()->emitOpError("no IREE::HAL::ExecutableExportOp found");
- return emitDefaultDefiniteFailure(target);
- }
- IREE::HAL::ExecutableExportOp exportOp = *maybeExportOp;
-
auto transformOp = cast<transform::TransformOpInterface>(getOperation());
rewriter.setInsertionPointToStart(&target.getFunctionBody().front());
@@ -100,12 +93,15 @@
getSyncAfterDistribution());
if (!diag.succeeded())
return diag;
- auto newAttr = rewriter.getIndexArrayAttr(getWorkgroupDims());
- auto subgroupSizeAttr = rewriter.getIndexAttr(getSubgroupSize());
- rewriter.startOpModification(exportOp);
- exportOp->setAttr(exportOp.getWorkgroupSizeAttrName(), newAttr);
- exportOp->setAttr(exportOp.getSubgroupSizeAttrName(), subgroupSizeAttr);
- rewriter.finalizeOpModification(exportOp);
+ if (failed(setTranslationInfo(
+ target, IREE::Codegen::TranslationInfoAttr::get(
+ rewriter.getContext(),
+ IREE::Codegen::DispatchLoweringPassPipeline::None,
+ getWorkgroupDims(), getSubgroupSize())))) {
+ target->emitOpError("failed to update translation info");
+ return emitDefaultDefiniteFailure(target);
+ }
+
return DiagnosedSilenceableFailure::success();
}
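
With this change the launch configuration round-trips through `translation_info` on the function instead of through the `hal.executable.export` op: the transform op above writes it with `setTranslationInfo`, and the ops changed further down in this file read it back. A small sketch of the read side, using the helpers this diff switches those readers to; their exact return types are inferred from the uses below and should be treated as assumptions.

```
// Sketch: reading back the launch configuration recorded via
// setTranslationInfo(...) above. Helper names follow their uses later in
// this diff.
static void dumpLaunchConfig(mlir::FunctionOpInterface funcOp) {
  // Both helpers return std::nullopt when translation_info (or the relevant
  // field on it) is absent -- the same failure cases the transform ops check.
  if (std::optional<SmallVector<int64_t>> workgroupSize =
          getWorkgroupSize(funcOp)) {
    llvm::errs() << "workgroup_size =";
    for (int64_t dim : *workgroupSize)
      llvm::errs() << " " << dim;
    llvm::errs() << "\n";
  }
  if (auto subgroupSize = getSubgroupSize(funcOp))
    llvm::errs() << "subgroup_size = " << *subgroupSize << "\n";
}
```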
@@ -273,56 +269,17 @@
return VectorDistributionResult{warpOp};
}
-// TODO: Refactor in a generic util that can be reused.
-static HAL::ExecutableExportOp
-getExecutableExportOpForFunc(HAL::ExecutableVariantOp halExecutableVariantOp,
- mlir::FunctionOpInterface funcOp) {
- if (!halExecutableVariantOp || !funcOp)
- return {};
- HAL::ExecutableExportOp exportOp;
- halExecutableVariantOp->walk([&](HAL::ExecutableExportOp op) {
- if (op.getSymName() != funcOp.getName())
- return WalkResult::advance();
- exportOp = op;
- return WalkResult::interrupt();
- });
- return exportOp;
-}
-
DiagnosedSilenceableFailure
transform_dialect::VectorToWarpExecuteOnLane0Op::applyToOne(
transform::TransformRewriter &rewriter, scf::IfOp target,
transform::ApplyToEachResultList &results,
transform::TransformState &state) {
- if (!isa<HAL::ExecutableOp, HAL::ExecutableVariantOp>(state.getTopLevel())) {
- results.assign(1, nullptr);
- return emitDefaultSilenceableFailure(state.getTopLevel())
- << "requires HAL::ExecutableOp or "
- "HAL::ExecutableVariantOp toplevel "
- "so that IR is properly isolated. This is required so "
- "we can "
- "safely inspect the HAL::ExecutableExportOp under "
- "multi-threaded "
- "pass assumptions.";
- }
-
- auto halExecutableVariantOp =
- target->getParentOfType<HAL::ExecutableVariantOp>();
auto funcOp = target->getParentOfType<mlir::FunctionOpInterface>();
- HAL::ExecutableExportOp exportOp =
- getExecutableExportOpForFunc(halExecutableVariantOp, funcOp);
- if (!halExecutableVariantOp || !funcOp || !exportOp) {
- // Return a silenceable failure and set the expected 1 result to
- // nullptr.
- results.assign(1, nullptr);
- return emitDefaultSilenceableFailure(target)
- << "export op is missing --- the transform is not "
- "applied";
- }
- std::optional<ArrayAttr> maybeAttr = exportOp.getWorkgroupSize();
+ std::optional<SmallVector<int64_t>> maybeWorkgroupSize =
+ getWorkgroupSize(funcOp);
// TODO: Pervasive 3 constant in IREE.
- if (!maybeAttr || maybeAttr->size() != 3) {
+ if (!maybeWorkgroupSize || maybeWorkgroupSize->empty()) {
// Return a silenceable failure and set the expected 1 result to
// nullptr.
results.assign(1, nullptr);
@@ -332,7 +289,7 @@
"--- the transform is not applied";
}
- int64_t workgroupSizeX = llvm::cast<IntegerAttr>((*maybeAttr)[0]).getInt();
+ int64_t workgroupSizeX = (*maybeWorkgroupSize)[0];
int64_t warpSize = getWarpSize();
if (workgroupSizeX % warpSize != 0) {
// Return a silenceable failure and set the expected 1 result to
@@ -602,18 +559,10 @@
transform::TransformRewriter &rewriter, mlir::FunctionOpInterface target,
transform::ApplyToEachResultList &results,
transform::TransformState &state) {
- FailureOr<IREE::HAL::ExecutableExportOp> maybeExportOp =
- getEntryPoint(target);
- if (failed(maybeExportOp)) {
- state.getTopLevel()->emitOpError("no IREE::HAL::ExecutableExportOp found");
- return emitDefaultDefiniteFailure(target);
- }
- IREE::HAL::ExecutableExportOp exportOp = *maybeExportOp;
-
- std::optional<llvm::APInt> subgroupSize = exportOp.getSubgroupSize();
+ auto subgroupSize = getSubgroupSize(target);
if (!subgroupSize) {
- state.getTopLevel()->emitOpError(
- "could not extract subgroup size from IREE::HAL::ExecutableExportOp");
+ target->emitOpError(
+ "could not extract subgroup size from IREE::Codegen::TranslationInfo");
return emitDefaultDefiniteFailure(target);
}
@@ -646,9 +595,9 @@
RewritePatternSet patterns(ctx);
populateVectorTransferWriteDistribution(target, patterns,
/*benefit=*/2);
+ unsigned subgroupSizeU = static_cast<unsigned>(subgroupSize.value());
populatePropagateVectorDistribution(target, patterns,
- /*benefit=*/1,
- subgroupSize->getSExtValue());
+ /*benefit=*/1, subgroupSizeU);
if (failed(
applyPatternsAndFoldGreedily(target, std::move(patterns), config))) {
return mlir::emitDefiniteFailure(
@@ -893,23 +842,8 @@
if (op->hasAttr("__parallel_region_boundary_for_test"))
return true;
- // We consider functions inside executable variants that have the same symbol
- // name as an export symbol.
- auto func = dyn_cast<FunctionOpInterface>(op);
- if (!func)
- return false;
- auto parent = op->getParentOfType<ModuleOp>();
- if (!parent)
- return false;
- auto variant = parent->getParentOfType<HAL::ExecutableVariantOp>();
- if (!variant)
- return false;
- WalkResult result = variant.walk([&](HAL::ExecutableExportOp exportOp) {
- if (exportOp.getSymNameAttr() == func.getNameAttr())
- return WalkResult::interrupt();
- return WalkResult::skip();
- });
- return result.wasInterrupted();
+ // We consider all functions inside executable variants.
+ return isa<FunctionOpInterface>(op);
}
/// Returns `true` if the op behaves like a sequential loop, e.g., the control
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
index 2ed206d..baa673e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file --iree-codegen-llvmgpu-use-vector-distribution \
-// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" %s | FileCheck %s
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
// to be migrated to the rocdl heuristics, but for now is just physically
@@ -12,63 +12,33 @@
// CHECK-SAME: subgroup_m_count = 1, subgroup_n_count = 4,
// CHECK-SAME: subgroup_m_tile_count = 4, subgroup_n_tile_count = 1, subgroup_k_tile_count = 8
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @expanded_matmul_transpose_b_executable {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {
- target_arch = "gfx940",
- mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
- #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>]
- }>) {
- hal.executable.export @expanded_matmul_transpose_b layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @expanded_matmul_transpose_b() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0)
- : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0)
- : !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 64, 2048], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>> -> tensor<2x64x2048xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 2048], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>> -> tensor<10x64x2048xf16>
-
- %5 = tensor.empty() : tensor<2x10x64x64xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x10x64x64xf16>) -> tensor<2x10x64x64xf16>
- %7 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
- ],
- iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]
- } ins(%3, %4 : tensor<2x64x2048xf16>, tensor<10x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) {
- ^bb0(%lhs: f16, %rhs: f16, %out: f16):
- %mul = arith.mulf %lhs, %rhs : f16
- %add = arith.addf %mul, %out : f16
- linalg.yield %add : f16
- } -> tensor<2x10x64x64xf16>
-
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 64], strides = [1, 1, 1, 1]
- : tensor<2x10x64x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx940"}>
+#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>
+#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>
+#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
+module {
+ func.func @expanded_matmul_transpose_b() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 64, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>> -> tensor<2x64x2048xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>> -> tensor<10x64x2048xf16>
+ %5 = tensor.empty() : tensor<2x10x64x64xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x10x64x64xf16>) -> tensor<2x10x64x64xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<2x64x2048xf16>, tensor<10x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.mulf %in, %in_0 : f16
+ %9 = arith.addf %8, %out : f16
+ linalg.yield %9 : f16
+ } -> tensor<2x10x64x64xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 64], strides = [1, 1, 1, 1] : tensor<2x10x64x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
+ return
}
}
-// CHECK-LABEL: hal.executable public @expanded_matmul_transpose_b
+// CHECK-LABEL: func.func @expanded_matmul_transpose_b()
// CHECK: linalg.generic {{.*}}lowering_config = #[[$TILE_SIZES]]
// -----
@@ -80,81 +50,46 @@
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2,
// CHECK-SAME: subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 2
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @conv_nhwc {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {
- target_arch = "gfx940",
- mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
- #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>]
- }>) {
- hal.executable.export @conv_nhwc layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @conv_nhwc() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 258, 514, 768], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>> -> tensor<2x258x514x768xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 768, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>> -> tensor<3x3x768x256xf16>
- %5 = tensor.empty() : tensor<2x256x512x256xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
- %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<2x258x514x768xf16>, tensor<3x3x768x256xf16>) outs(%6 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 256, 512, 256], strides = [1, 1, 1, 1] : tensor<2x256x512x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx940"}>
+module {
+ func.func @conv_nhwc() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 258, 514, 768], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>> -> tensor<2x258x514x768xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 768, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>> -> tensor<3x3x768x256xf16>
+ %5 = tensor.empty() : tensor<2x256x512x256xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<2x258x514x768xf16>, tensor<3x3x768x256xf16>) outs(%6 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 256, 512, 256], strides = [1, 1, 1, 1] : tensor<2x256x512x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
+ return
}
}
-// CHECK-LABEL: hal.executable public @conv_nhwc
+// CHECK-LABEL: func.func @conv_nhwc()
// CHECK: linalg.conv_2d_nhwc_hwcf {{.*}} lowering_config = #[[$TILE_SIZES]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @matmul_256x256x256 {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {
- target_arch = "gfx940",
- mma_intrinsics = []
- }>) {
- hal.executable.export @matmul_256x256x256 layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @matmul_256x256x256() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
- %5 = tensor.empty() : tensor<256x256xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<256x256xf32>) -> tensor<256x256xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<256x256xf16>, tensor<256x256xf16>) outs(%6 : tensor<256x256xf32>) -> tensor<256x256xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
- return
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [], target_arch = "gfx940"}>
+module {
+ func.func @matmul_256x256x256() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
+ %5 = tensor.empty() : tensor<256x256xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<256x256xf32>) -> tensor<256x256xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<256x256xf16>, tensor<256x256xf16>) outs(%6 : tensor<256x256xf32>) -> tensor<256x256xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
+ return
}
}
-}
// Check that we do not use the distribute pipeline if there are no supported
// intrinsics.
@@ -169,43 +104,25 @@
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2,
// CHECK-SAME: subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @mfma_matmul_1024x1024x1024 {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {
- target_arch = "gfx940",
- mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
- #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>]
- }>) {
- hal.executable.export @mfma_matmul_1024x1024x1024 layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mfma_matmul_1024x1024x1024() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
- %5 = tensor.empty() : tensor<1024x1024xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
- return
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx940"}>
+module {
+ func.func @mfma_matmul_1024x1024x1024() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+ %5 = tensor.empty() : tensor<1024x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+ return
}
}
-}
-// CHECK-LABEL: hal.executable public @mfma_matmul_1024x1024x1024
+// CHECK-LABEL: func.func @mfma_matmul_1024x1024x1024()
// CHECK: linalg.matmul {{.*}}lowering_config = #[[$TILE_SIZES]]
// -----
@@ -217,56 +134,43 @@
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2,
// CHECK-SAME: subgroup_m_tile_count = 1, subgroup_n_tile_count = 1, subgroup_k_tile_count = 2
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @conv_nchwc {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {
- target_arch = "gfx940",
- mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
- #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>]
- }>) {
- hal.executable.export @conv_nchwc layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @conv_nchwc() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
-      %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [2, 20, 34, 34, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>> -> tensor<2x20x34x34x64xf16>
-      %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 20, 3, 3, 160, 64], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>> -> tensor<8x20x3x3x160x64xf16>
- %5 = tensor.empty() : tensor<2x8x32x32x160xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x8x32x32x160xf32>) -> tensor<2x8x32x32x160xf32>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d5, d2 + d6, d3 + d7, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d5, d6, d7, d4, d8)>, affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "reduction"]} ins(%3, %4 : tensor<2x20x34x34x64xf16>, tensor<8x20x3x3x160x64xf16>) outs(%6 : tensor<2x8x32x32x160xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 1, 32, 0, 1, 1, 1, 0]]>} {
- ^bb0(%in: f16, %in_0: f16, %out: f32):
- %10 = arith.extf %in : f16 to f32
- %11 = arith.extf %in_0 : f16 to f32
- %12 = arith.mulf %10, %11 : f32
- %13 = arith.addf %out, %12 : f32
- linalg.yield %13 : f32
- } -> tensor<2x8x32x32x160xf32>
- %8 = tensor.empty() : tensor<2x8x32x32x160xf16>
- %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x8x32x32x160xf32>) outs(%8 : tensor<2x8x32x32x160xf16>) {
- ^bb0(%in: f32, %out: f16):
- %10 = arith.truncf %in : f32 to f16
- linalg.yield %10 : f16
- } -> tensor<2x8x32x32x160xf16>
- flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0, 0], sizes = [2, 8, 32, 32, 160], strides = [1, 1, 1, 1, 1] : tensor<2x8x32x32x160xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
- return
- }
-
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 1, 32, 0, 1, 1, 1, 0]]>
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx940"}>
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d5, d2 + d6, d3 + d7, d8)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d5, d6, d7, d4, d8)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d2, d3, d4)>
+#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
+module {
+ func.func @conv_nchwc() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [2, 20, 34, 34, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>> -> tensor<2x20x34x34x64xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 20, 3, 3, 160, 64], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>> -> tensor<8x20x3x3x160x64xf16>
+ %5 = tensor.empty() : tensor<2x8x32x32x160xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x8x32x32x160xf32>) -> tensor<2x8x32x32x160xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "reduction"]} ins(%3, %4 : tensor<2x20x34x34x64xf16>, tensor<8x20x3x3x160x64xf16>) outs(%6 : tensor<2x8x32x32x160xf32>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f16, %in_0: f16, %out: f32):
+ %10 = arith.extf %in : f16 to f32
+ %11 = arith.extf %in_0 : f16 to f32
+ %12 = arith.mulf %10, %11 : f32
+ %13 = arith.addf %out, %12 : f32
+ linalg.yield %13 : f32
+ } -> tensor<2x8x32x32x160xf32>
+ %8 = tensor.empty() : tensor<2x8x32x32x160xf16>
+ %9 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x8x32x32x160xf32>) outs(%8 : tensor<2x8x32x32x160xf16>) {
+ ^bb0(%in: f32, %out: f16):
+ %10 = arith.truncf %in : f32 to f16
+ linalg.yield %10 : f16
+ } -> tensor<2x8x32x32x160xf16>
+ flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0, 0], sizes = [2, 8, 32, 32, 160], strides = [1, 1, 1, 1, 1] : tensor<2x8x32x32x160xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
+ return
}
}
-// CHECK-LABEL: hal.executable public @conv_nchwc
+// CHECK-LABEL: func.func @conv_nchwc()
// CHECK: linalg.generic {{.*}}lowering_config = #[[$TILE_SIZES]]
// -----
@@ -278,40 +182,23 @@
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2,
// CHECK-SAME: subgroup_m_tile_count = 2, subgroup_n_tile_count = 4, subgroup_k_tile_count = 4
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @wmma_matmul_1024x1024x1024 {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {
- target_arch = "gfx1100",
- mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>]
- }>) {
- hal.executable.export @wmma_matmul_1024x1024x1024 layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @wmma_matmul_1024x1024x1024() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
- %5 = tensor.empty() : tensor<1024x1024xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
- return
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>], target_arch = "gfx1100"}>
+module {
+ func.func @wmma_matmul_1024x1024x1024() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+ %5 = tensor.empty() : tensor<1024x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+ return
}
}
-}
-// CHECK-LABEL: hal.executable public @wmma_matmul_1024x1024x1024
+// CHECK-LABEL: func.func @wmma_matmul_1024x1024x1024()
// CHECK: linalg.matmul {{.*}}lowering_config = #[[$TILE_SIZES]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
index 16dfc3a..d885f72 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-rocdl-select-lowering-strategy, iree-rocdl-lower-executable-target)))' -mlir-print-local-scope %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-rocdl-select-lowering-strategy, func.func(iree-rocdl-lower-executable-target)))))' -mlir-print-local-scope %s | FileCheck %s
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx90a", ukernels = "none"}>
@@ -30,11 +30,8 @@
}
}
-// CHECK-LABEL: hal.executable.export public @scalar_dispatch
-// CHECK-SAME: translation_info = #iree_codegen.translation_info<LLVMGPUBaseLowering>
-// CHECK-SAME: workgroup_size = [1 : index, 1 : index, 1 : index]
-
-// CHECK: func.func @scalar_dispatch()
+// CHECK-LABEL: func.func @scalar_dispatch()
+// CHECK-SAME: translation_info = #iree_codegen.translation_info<LLVMGPUBaseLowering workgroup_size = [1, 1, 1]>
// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0)
// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1)
// CHECK: memref.load %[[SPAN0]][] : memref<i64, #hal.descriptor_type<storage_buffer>>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index a3163bc..99a864b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
-// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
// to be migrated to the rocdl heuristics, but for now is just physically
@@ -43,16 +43,12 @@
// Basic pipeline test to make sure it generates the instructions we expect.
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 2, subgroup_k_tile_count = 8>
-// CHECK-LABEL: hal.executable.export public @matmul_256x256x256_f16_f32
-// CHECK-SAME: subgroup_size = 64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [128 : index, 2 : index, 1 : index]
-
-// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32
+// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x1x1x1x4xf32>)
// Each subgroup handles 2 * 2 tiles, and for each tile we accumulate 8 times
// along the K dimension. So in total 32 mfma ops.
@@ -98,16 +94,12 @@
}
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 2, subgroup_k_tile_count = 8>
-// CHECK-LABEL: hal.executable.export public @matmul_256x256x256_f16_f16
-// CHECK-SAME: subgroup_size = 64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [128 : index, 2 : index, 1 : index]
-
-// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16
+// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<2x2x1x1x1x4xf16>)
// CHECK: arith.extf %[[ARG]] : vector<2x2x1x1x1x4xf16> to vector<2x2x1x1x1x4xf32>
// CHECK-COUNT-32: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
@@ -173,10 +165,9 @@
}
}
-// CHECK-LABEL: hal.executable.export public @expanded_matmul_transpose_b
-// CHECK-SAME: workgroup_size = [256 : index, 1 : index, 1 : index]
-
-// CHECK-LABEL: func @expanded_matmul_transpose_b
+// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
+// CHECK: func @expanded_matmul_transpose_b
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// This has more than 2 iterations. So we have prefetching enabled for this case. Due to
// prefetching, we have one iteration peeled off so the upper bound is 2048 - 128 = 1920.
// CHECK: scf.for {{.*}} = %c0 to %c1920 step %c128 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<4x1x1x1x1x4xf16>)
@@ -302,15 +293,10 @@
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 2, subgroup_k_tile_count = 8>
-// CHECK-LABEL: hal.executable.export public @generic_2x1024x20x64x1280_f16
-// CHECK-SAME: subgroup_size = 64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [128 : index, 2 : index, 1 : index]
-
// CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16
// This has more than 2 iterations. So we have prefetching enabled for this case. Due to
// prefetching, we have one iteration peeled off so the upper bound is 1280 - 128 = 1152.
@@ -359,16 +345,13 @@
}
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
// CHECK-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>,
// CHECK-SAME: subgroup_m_count = 2, subgroup_n_count = 2, subgroup_m_tile_count = 2, subgroup_n_tile_count = 2, subgroup_k_tile_count = 8>
-// CHECK-LABEL: hal.executable.export public @matmul_256x256x256_f16_f32
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x8x1x1x1xf32>)
// Each subgroup handles 2 * 2 tiles, and for each tile we accumulate 8 times
// along the K dimension. So in total 32 wmma ops.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir
index 61b72b5..2f78cb2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-rocdl-configuration-pipeline, iree-codegen-linalg-to-rocdl-pipeline2)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-rocdl-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline2)))" %s | FileCheck %s
hal.executable private @warp_reduction {
hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
index dc7c533..49cf593 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -1,31 +1,23 @@
-// RUN: iree-opt %s --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-transform-dialect-interpreter)))' \
+// RUN: iree-opt %s --pass-pipeline='builtin.module(iree-transform-dialect-interpreter)' \
// RUN: --iree-codegen-transform-dialect-library=%p/attention_transform_spec.mlir| \
// RUN: FileCheck --check-prefix=CHECK %s
-hal.executable @_attention_dispatch_0 {
- hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @_attention_dispatch_0 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @_attention_dispatch_0() {
- %c0 = arith.constant 0 : index
- %scale = arith.constant 0.125 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<192x1024x64xf16>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>> -> tensor<192x1024x64xf16>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>> -> tensor<192x1024x64xf16>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>> -> tensor<192x1024x64xf16>
- %7 = tensor.empty() : tensor<192x1024x64xf16>
- %8 = iree_linalg_ext.attention ins(%4, %5, %6, %scale : tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, f16) outs(%7 : tensor<192x1024x64xf16>) -> tensor<192x1024x64xf16>
- flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : tensor<192x1024x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<192x1024x64xf16>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+module {
+ func.func @_attention_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 1.250000e-01 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<192x1024x64xf16>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>> -> tensor<192x1024x64xf16>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>> -> tensor<192x1024x64xf16>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<192x1024x64xf16>> -> tensor<192x1024x64xf16>
+ %7 = tensor.empty() : tensor<192x1024x64xf16>
+ %8 = iree_linalg_ext.attention ins(%4, %5, %6, %cst : tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, f16) outs(%7 : tensor<192x1024x64xf16>) -> tensor<192x1024x64xf16>
+ flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : tensor<192x1024x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<192x1024x64xf16>>
+ return
}
}
@@ -37,7 +29,9 @@
// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0, d1) -> (d1, d0)>
-// CHECK: func.func @_attention_dispatch_0() {
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<None workgroup_size = [4, 8, 4] subgroup_size = 32>
+// CHECK: func.func @_attention_dispatch_0()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<32x64xf32>
// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<-1.000000e+30> : vector<32xf32>
// CHECK-DAG: %[[CST_1:.+]] = arith.constant dense<0.000000e+00> : vector<32xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir
index 439dc88..59983fc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir
@@ -1,37 +1,29 @@
-// RUN: iree-opt %s --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-transform-dialect-interpreter)))' \
+// RUN: iree-opt %s --pass-pipeline='builtin.module(iree-transform-dialect-interpreter)' \
// RUN: --iree-codegen-transform-dialect-library=%p/attention_mfma_transform_spec.mlir | \
// RUN: FileCheck --check-prefix=CHECK %s
-hal.executable private @attention {
- hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
- hal.executable.export public @attention_dispatch_0_attention_16x16384x128xf16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @attention_dispatch_0_attention_16x16384x128xf16() {
- // CHECK-NOT: vector.contract
- // CHECK-NOT: iree_vector_ext.to_simd
- // CHECK-NOT: iree_vector_ext.to_simt
- // CHECK-COUNT-8: vector.load {{.*}} : memref<16x16384x128xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16>
- // CHECK: scf.for {{.*}} = %c0 to %c16384 step %c64 {{.*}} -> (vector<2xf32>, vector<2xf32>, vector<8x2x4xf32>)
- // CHECK-COUNT-16: vector.load {{.*}} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<8xf16>
- // CHECK-COUNT-128: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32}
- %c0 = arith.constant 0 : index
- %scale = arith.constant 0.08838834764 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x16384x128xf16>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>> -> tensor<16x16384x128xf16>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>> -> tensor<16x16384x128xf16>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>> -> tensor<16x16384x128xf16>
- %7 = tensor.empty() : tensor<16x16384x128xf16>
- %8 = iree_linalg_ext.attention ins(%4, %5, %6, %scale : tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, f16) outs(%7 : tensor<16x16384x128xf16>) -> tensor<16x16384x128xf16>
- flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : tensor<16x16384x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x16384x128xf16>>
- return
- }
- }
+#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb">
+module {
+ func.func @attention_dispatch_0_attention_16x16384x128xf16() attributes {hal.executable.target = #executable_target} {
+ %c0 = arith.constant 0 : index
+ %scale = arith.constant 0.08838834764 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x16384x128xf16>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>> -> tensor<16x16384x128xf16>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>> -> tensor<16x16384x128xf16>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x16384x128xf16>> -> tensor<16x16384x128xf16>
+ %7 = tensor.empty() : tensor<16x16384x128xf16>
+ %8 = iree_linalg_ext.attention ins(%4, %5, %6, %scale : tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, f16) outs(%7 : tensor<16x16384x128xf16>) -> tensor<16x16384x128xf16>
+ flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : tensor<16x16384x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x16384x128xf16>>
+ return
}
}
+ // CHECK-NOT: vector.contract
+ // CHECK-NOT: iree_vector_ext.to_simd
+ // CHECK-NOT: iree_vector_ext.to_simt
+ // CHECK-COUNT-8: vector.load {{.*}} : memref<16x16384x128xf16, #hal.descriptor_type<storage_buffer>>, vector<8xf16>
+ // CHECK: scf.for {{.*}} = %c0 to %c16384 step %c64 {{.*}} -> (vector<2xf32>, vector<2xf32>, vector<8x2x4xf32>)
+ // CHECK-COUNT-16: vector.load {{.*}} : memref<64x128xf16, #gpu.address_space<workgroup>>, vector<8xf16>
+ // CHECK-COUNT-128: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
index 284792f..ea07b77 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
@@ -1,7 +1,7 @@
#layout = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>
module attributes { transform.with_named_sequence } {
- transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op) {
// Get attention op
// ==========================================
%attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
@@ -11,7 +11,7 @@
%tiled_attention, %forall_grid =
transform.structured.tile_using_forall %attention tile_sizes [1, 128]
( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ // transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
// Tile batch dimensions of attention
// ==========================================
@@ -127,18 +127,18 @@
transform.apply_cse to %func_3 : !transform.any_op
transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %func_4 = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
// Step 5. Pre-process the contract and transfer ops to put it in the right form.
// ===========================================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func_2 {
transform.apply_patterns.iree.fold_arith_ext_into_contraction
} : !transform.any_op
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [64, 4, 1] subgroup_size = 64 : (!transform.any_op) -> ()
@@ -162,14 +162,14 @@
transform.apply_registered_pass "iree-amdgpu-prepare-chained-matmul" to %func_8 : (!transform.any_op) -> (!transform.any_op)
// Get the vector.contract ops.
- %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%contract1, %contract2 = transform.split_handle %contracts : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%layout16x16x16 = transform.param.constant #layout -> !transform.any_param
transform.iree.set_contraction_layout_attributes %contract1, %layout16x16x16 { read_layout_indices = array<i64: 0, 1> } : !transform.any_op, !transform.any_param
transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param
- %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.amdgpu_distribute_vectors %distribute_func test_conversion : !transform.any_op
transform.apply_patterns to %distribute_func {
@@ -179,7 +179,7 @@
// Distribute shared memory copies
// ==========================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_10 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.gpu_distribute_shared_memory_copy %func_10 : (!transform.any_op) -> ()
transform.apply_patterns to %func_10 {
transform.apply_patterns.memref.fold_memref_alias_ops
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
index f819a30..152b754 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_transform_spec.mlir
@@ -1,5 +1,5 @@
module attributes { transform.with_named_sequence } {
- transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op) {
// Get attention op
// ==========================================
%attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
@@ -9,7 +9,7 @@
%tiled_attention, %forall_grid =
transform.structured.tile_using_forall %attention tile_sizes [1, 128]
( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
- transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+ // transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
// Tile batch dimensions of attention
// ==========================================
@@ -123,18 +123,18 @@
transform.apply_cse to %func_3 : !transform.any_op
transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %func_4 = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
// Step 5. Pre-process the contract and transfer ops to put it in the right form.
// ===========================================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func_2 {
transform.apply_patterns.iree.prepare_vector_to_mma
} : !transform.any_op
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 sync_after_distribution = false : (!transform.any_op) -> ()
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
index 258bf1b..dd97742 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
@@ -1,347 +1,252 @@
-// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))' \
-// RUN: %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @dynamic_batch_matvec {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>) {
- hal.executable.export @dynamic_batch_matvec layout(#pipeline_layout)
- builtin.module {
- func.func @dynamic_batch_matvec() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %13 = arith.index_castui %0 : i32 to index
- %18 = arith.index_castui %1 : i32 to index
- %19 = arith.index_castui %2 : i32 to index
- %24 = arith.index_castui %3 : i32 to index
- %29 = arith.index_castui %4 : i32 to index
- %30 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%19) : !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
- %31 = flow.dispatch.workload.ordinal %24, 0 : index
- %32 = flow.dispatch.workload.ordinal %29, 1 : index
- %33 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%13) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%31}
- %34 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%18) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%32}
- %35 = flow.dispatch.tensor.load %33, offsets = [0, 0, 0], sizes = [32, 1, %31], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%31} -> tensor<32x1x?xf16>
- %36 = flow.dispatch.tensor.load %34, offsets = [0, 0, 0], sizes = [32, %32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%32} -> tensor<32x?x128xf16>
- %37 = tensor.empty() : tensor<32x1x128xf16>
- %38 = linalg.fill ins(%cst : f16) outs(%37 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
- %39 = linalg.batch_matmul ins(%35, %36 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%38 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
- flow.dispatch.tensor.store %39, %30, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>
+module {
+ func.func @dynamic_batch_matvec() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = arith.index_castui %0 : i32 to index
+ %6 = arith.index_castui %1 : i32 to index
+ %7 = arith.index_castui %2 : i32 to index
+ %8 = arith.index_castui %3 : i32 to index
+ %9 = arith.index_castui %4 : i32 to index
+ %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
+ %11 = flow.dispatch.workload.ordinal %8, 0 : index
+ %12 = flow.dispatch.workload.ordinal %9, 1 : index
+ %13 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11}
+ %14 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12}
+ %15 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [32, 1, %11], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11} -> tensor<32x1x?xf16>
+ %16 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0], sizes = [32, %12, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12} -> tensor<32x?x128xf16>
+ %17 = tensor.empty() : tensor<32x1x128xf16>
+ %18 = linalg.fill ins(%cst : f16) outs(%17 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
+ %19 = linalg.batch_matmul ins(%15, %16 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%18 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
+ flow.dispatch.tensor.store %19, %10, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 1], [0, 0, 0, 32]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @dynamic_batch_matvec
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [32, 1, 1]>
// CHECK: func.func @dynamic_batch_matvec()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.batch_matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @vmt {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
- hal.executable.export @vmt layout(#pipeline_layout)
- builtin.module {
- func.func @vmt() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
- %5 = tensor.empty() : tensor<1x32000xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %8 = arith.mulf %in, %in_0 : f16
- %9 = arith.addf %out, %8 : f16
- linalg.yield %9 : f16
- } -> tensor<1x32000xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @vmt() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
+ %5 = tensor.empty() : tensor<1x32000xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.mulf %in, %in_0 : f16
+ %9 = arith.addf %out, %8 : f16
+ linalg.yield %9 : f16
+ } -> tensor<1x32000xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 8], [0, 0, 512]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @vmt
-// CHECK-SAME: subgroup_size = 64 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
// CHECK: func.func @vmt()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @vmt {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>) {
- hal.executable.export @vmt layout(#pipeline_layout)
- builtin.module {
- func.func @vmt() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
- %5 = tensor.empty() : tensor<1x32000xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %8 = arith.mulf %in, %in_0 : f16
- %9 = arith.addf %out, %8 : f16
- linalg.yield %9 : f16
- } -> tensor<1x32000xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @vmt() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf16>> -> tensor<1x4096xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
+ %5 = tensor.empty() : tensor<1x32000xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.mulf %in, %in_0 : f16
+ %9 = arith.addf %out, %8 : f16
+ linalg.yield %9 : f16
+ } -> tensor<1x32000xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x32000xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 8], [0, 0, 512]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @vmt
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 32>
// CHECK: func.func @vmt()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-hal.executable private @i4_dequant_matvec {
- hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
- hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer, ReadOnly>, <4, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @i4_dequant_matvec() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096xf16>>
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>> -> tensor<4096x32x128xi4>
- %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>> -> tensor<4096x32xf16>
- %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>> -> tensor<4096x32xf16>
- %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
- %9 = tensor.empty() : tensor<4096xf16>
- %10 = tensor.empty() : tensor<4096x32x128xf16>
- %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16>
- %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) {
- ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
- %14 = arith.extui %in : i4 to i32
- %15 = arith.uitofp %14 : i32 to f16
- %16 = arith.subf %15, %in_1 : f16
- %17 = arith.mulf %16, %in_0 : f16
- linalg.yield %17 : f16
- } -> tensor<4096x32x128xf16>
- %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %14 = arith.mulf %in, %in_0 : f16
- %15 = arith.addf %14, %out : f16
- linalg.yield %15 : f16
- } -> tensor<4096xf16>
- flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<4096xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0)>
+module {
+ func.func @i4_dequant_matvec() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096xf16>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>> -> tensor<4096x32x128xi4>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>> -> tensor<4096x32xf16>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32xf16>> -> tensor<4096x32xf16>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
+ %9 = tensor.empty() : tensor<4096xf16>
+ %10 = tensor.empty() : tensor<4096x32x128xf16>
+ %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16>
+ %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) {
+ ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
+ %14 = arith.extui %in : i4 to i32
+ %15 = arith.uitofp %14 : i32 to f16
+ %16 = arith.subf %15, %in_1 : f16
+ %17 = arith.mulf %16, %in_0 : f16
+ linalg.yield %17 : f16
+ } -> tensor<4096x32x128xf16>
+ %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %14 = arith.mulf %in, %in_0 : f16
+ %15 = arith.addf %14, %out : f16
+ linalg.yield %15 : f16
+ } -> tensor<4096xf16>
+ flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<4096xf16>>
+ return
}
}
// TODO: We should process multiple rows per subgroup.
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1], [0, 4, 128]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
// CHECK: func.func @i4_dequant_matvec()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Send 2xNxK mmt to the warp reduction pipeline.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @skinny_mmt {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
- hal.executable.export @skinny_mmt layout(#pipeline_layout)
- builtin.module {
- func.func @skinny_mmt() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x32000xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>> -> tensor<2x4096xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
- %5 = tensor.empty() : tensor<2x32000xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x32000xf16>) -> tensor<2x32000xf16>
- %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<2x4096xf16>, tensor<32000x4096xf16>)
- outs(%6 : tensor<2x32000xf16>) -> tensor<2x32000xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 32000], strides = [1, 1] : tensor<2x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x32000xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
+module {
+ func.func @skinny_mmt() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x32000xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>> -> tensor<2x4096xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
+ %5 = tensor.empty() : tensor<2x32000xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x32000xf16>) -> tensor<2x32000xf16>
+ %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<2x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<2x32000xf16>) -> tensor<2x32000xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 32000], strides = [1, 1] : tensor<2x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x32000xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 512]{{\]}}>
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @skinny_mmt
-// CHECK-SAME: subgroup_size = 64 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
// CHECK: func.func @skinny_mmt()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul_transpose_b
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Send Mx2xK mmt to the warp reduction pipeline.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @skinny_mmt {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
- hal.executable.export @skinny_mmt layout(#pipeline_layout)
- builtin.module {
- func.func @skinny_mmt() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>> -> tensor<2x4096xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
- %5 = tensor.empty() : tensor<32000x2xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
- %7 = linalg.matmul_transpose_b ins(%4, %3 : tensor<32000x4096xf16>, tensor<2x4096xf16>)
- outs(%6 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32000, 2], strides = [1, 1] : tensor<32000x2xf16> -> !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
+module {
+ func.func @skinny_mmt() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>> -> tensor<2x4096xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
+ %5 = tensor.empty() : tensor<32000x2xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
+ %7 = linalg.matmul_transpose_b ins(%4, %3 : tensor<32000x4096xf16>, tensor<2x4096xf16>) outs(%6 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32000, 2], strides = [1, 1] : tensor<32000x2xf16> -> !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 512]{{\]}}>
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @skinny_mmt
-// CHECK-SAME: subgroup_size = 64 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
// CHECK: func.func @skinny_mmt()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul_transpose_b
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @not_vmt {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
- hal.executable.export @not_vmt layout(#pipeline_layout)
- builtin.module {
- func.func @not_vmt() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [5, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>> -> tensor<5x4096xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
- %5 = tensor.empty() : tensor<5x32000xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<5x32000xf16>) -> tensor<5x32000xf16>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%3, %4 : tensor<5x4096xf16>, tensor<32000x4096xf16>)
- outs(%6 : tensor<5x32000xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %8 = arith.mulf %in, %in_0 : f16
- %9 = arith.addf %out, %8 : f16
- linalg.yield %9 : f16
- } -> tensor<5x32000xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [5, 32000], strides = [1, 1] : tensor<5x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @not_vmt() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [5, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>> -> tensor<5x4096xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
+ %5 = tensor.empty() : tensor<5x32000xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<5x32000xf16>) -> tensor<5x32000xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<5x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<5x32000xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.mulf %in, %in_0 : f16
+ %9 = arith.addf %out, %8 : f16
+ linalg.yield %9 : f16
+ } -> tensor<5x32000xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [5, 32000], strides = [1, 1] : tensor<5x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 128, 8]{{\]}}>
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-// CHECK-LABEL: hal.executable.export public @not_vmt
-// CHECK-SAME: subgroup_size = 64 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 1, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
// CHECK: func.func @not_vmt()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
-
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test.mlir
index 68eab97..4e92cc0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target,canonicalize)))' \
+// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \
// RUN: %s | FileCheck %s
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
@@ -111,8 +111,7 @@
#hal.interface.binding<0, 2>,
#hal.interface.binding<0, 3>
],
- translation_info = #iree_codegen.translation_info<LLVMGPUVectorize>,
- workgroup_size = [16 : index, 2 : index, 1 : index]} {
+ translation_info = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [16, 2, 1]>} {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
index ce5e33a..1359868 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
@@ -1,66 +1,43 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-tile-and-distribute)))))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-llvmgpu-tile-and-distribute))" %s | FileCheck %s
#config = #iree_codegen.lowering_config<tile_sizes = [[2, 256, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-#map0 = affine_map<()[s0] -> (s0 * 2)>
+#map = affine_map<()[s0] -> (s0 * 2)>
#map1 = affine_map<()[s0] -> (s0 * 256)>
-#map2 = affine_map<(d0) -> (2, -d0 + 1024)>
-#map3 = affine_map<(d0) -> (256, -d0 + 1024)>
-#map4 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
-hal.executable private @dot_dispatch_0 {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @dot_dispatch_0 layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @dot_dispatch_0() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c1024 = arith.constant 1024 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x1024xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1024x1024xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x1024xf32>
- %workgroup_size_x = hal.interface.workgroup.size[0] : index
- %workgroup_size_y = hal.interface.workgroup.size[1] : index
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %3 = affine.apply #map0()[%workgroup_id_y]
- %4 = affine.apply #map0()[%workgroup_count_y]
- scf.for %arg0 = %3 to %c1024 step %4 {
- %5 = affine.apply #map1()[%workgroup_id_x]
- %6 = affine.apply #map1()[%workgroup_count_x]
- scf.for %arg1 = %5 to %c1024 step %6 {
- %8 = memref.subview %0[%arg0, 0] [2, 1024] [1, 1]
- : memref<1024x1024xf32> to memref<2x1024xf32, #map4>
- %10 = memref.subview %1[0, %arg1] [1024, 256] [1, 1]
- : memref<1024x1024xf32> to memref<1024x256xf32, #map4>
- %11 = memref.subview %2[%arg0, %arg1] [2, 256] [1, 1]
- : memref<1024x1024xf32> to memref<2x256xf32, #map4>
- linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : memref<2x256xf32, #map4>)
- linalg.matmul {lowering_config = #config}
- ins(%8, %10 : memref<2x1024xf32, #map4>, memref<1024x256xf32, #map4>)
- outs(%11 : memref<2x256xf32, #map4>)
- }
- }
- return
+#map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [64, 1, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @dot_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %c1024 = arith.constant 1024 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x1024xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1024x1024xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x1024xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %3 to %c1024 step %4 {
+ %5 = affine.apply #map1()[%workgroup_id_x]
+ %6 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg1 = %5 to %c1024 step %6 {
+ %subview = memref.subview %0[%arg0, 0] [2, 1024] [1, 1] : memref<1024x1024xf32> to memref<2x1024xf32, #map2>
+ %subview_0 = memref.subview %1[0, %arg1] [1024, 256] [1, 1] : memref<1024x1024xf32> to memref<1024x256xf32, #map2>
+ %subview_1 = memref.subview %2[%arg0, %arg1] [2, 256] [1, 1] : memref<1024x1024xf32> to memref<2x256xf32, #map2>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<2x256xf32, #map2>)
+ linalg.matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<2x1024xf32, #map2>, memref<1024x256xf32, #map2>) outs(%subview_1 : memref<2x256xf32, #map2>)
}
}
+ return
}
}
-// CHECK-LABEL: hal.executable private @dot_dispatch_0
-// CHECK: hal.executable.variant public @cuda
+// CHECK: func.func @dot_dispatch_0()
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
@@ -89,23 +66,16 @@
// -----
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @batch_matmul_func {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @batch_matmul_func layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [8 : index, 8 : index, 1 : index]
- }
-builtin.module {
- func.func @batch_matmul_func() {
+#map = affine_map<()[s0] -> (s0 * 8)>
+#map1 = affine_map<()[s0] -> (s0 * 32)>
+#map2 = affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>
+#map3 = affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>
+#map4 = affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [8, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @batch_matmul_func() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
@@ -124,29 +94,26 @@
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z {
- %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
scf.for %arg1 = %3 to %c32 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
+ %5 = affine.apply #map1()[%workgroup_id_x]
+ %6 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg2 = %5 to %c64 step %6 {
- %7 = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>
- %8 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>
- %9 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
- linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} ins(%cst : f32) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
- linalg.batch_matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 8, 32, 32]]>} ins(%7, %8 : memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>, memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
+ %subview = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, #map2>
+ %subview_0 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, #map3>
+ %subview_1 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, #map4>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<1x8x32xf32, #map4>)
+ linalg.batch_matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<1x8x1024xf32, #map2>, memref<1x1024x32xf32, #map3>) outs(%subview_1 : memref<1x8x32xf32, #map4>)
}
}
}
return
}
}
-}
-}
// CHECK: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 * 4)>
-// CHECK-LABEL: hal.executable private @batch_matmul_func
-// CHECK: hal.executable.variant public @cuda
+// CHECK: func.func @batch_matmul_func()
// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
// CHECK-DAG: %[[TX:.*]] = gpu.thread_id x
@@ -169,66 +136,43 @@
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[2, 32, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-#map0 = affine_map<()[s0] -> (s0 * 2)>
+#map = affine_map<()[s0] -> (s0 * 2)>
#map1 = affine_map<()[s0] -> (s0 * 32)>
-#map2 = affine_map<(d0) -> (2, -d0 + 1024)>
-#map3 = affine_map<(d0) -> (32, -d0 + 1024)>
-#map4 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
-hal.executable private @dot_dispatch_0 {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @dot_dispatch_0 layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 8 : index, 1 : index]
- }
- builtin.module {
- func.func @dot_dispatch_0() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %c1024 = arith.constant 1024 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x1024xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1024x1024xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x1024xf32>
- %workgroup_size_x = hal.interface.workgroup.size[0] : index
- %workgroup_size_y = hal.interface.workgroup.size[1] : index
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %3 = affine.apply #map0()[%workgroup_id_y]
- %4 = affine.apply #map0()[%workgroup_count_y]
- scf.for %arg0 = %3 to %c1024 step %4 {
- %5 = affine.apply #map1()[%workgroup_id_x]
- %6 = affine.apply #map1()[%workgroup_count_x]
- scf.for %arg1 = %5 to %c1024 step %6 {
- %8 = memref.subview %0[%arg0, 0] [2, 1024] [1, 1]
- : memref<1024x1024xf32> to memref<2x1024xf32, #map4>
- %10 = memref.subview %1[0, %arg1] [1024, 32] [1, 1]
- : memref<1024x1024xf32> to memref<1024x32xf32, #map4>
- %11 = memref.subview %2[%arg0, %arg1] [2, 32] [1, 1]
- : memref<1024x1024xf32> to memref<2x32xf32, #map4>
- linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : memref<2x32xf32, #map4>)
- linalg.matmul {lowering_config = #config}
- ins(%8, %10 : memref<2x1024xf32, #map4>, memref<1024x32xf32, #map4>)
- outs(%11 : memref<2x32xf32, #map4>)
- }
- }
- return
+#map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @dot_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %c1024 = arith.constant 1024 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x1024xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1024x1024xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x1024xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %3 to %c1024 step %4 {
+ %5 = affine.apply #map1()[%workgroup_id_x]
+ %6 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg1 = %5 to %c1024 step %6 {
+ %subview = memref.subview %0[%arg0, 0] [2, 1024] [1, 1] : memref<1024x1024xf32> to memref<2x1024xf32, #map2>
+ %subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32> to memref<1024x32xf32, #map2>
+ %subview_1 = memref.subview %2[%arg0, %arg1] [2, 32] [1, 1] : memref<1024x1024xf32> to memref<2x32xf32, #map2>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<2x32xf32, #map2>)
+ linalg.matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<2x1024xf32, #map2>, memref<1024x32xf32, #map2>) outs(%subview_1 : memref<2x32xf32, #map2>)
}
}
+ return
}
}
-// CHECK-LABEL: hal.executable private @dot_dispatch_0
-// CHECK: hal.executable.variant public @cuda
+// CHECK: func.func @dot_dispatch_0()
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
@@ -257,48 +201,35 @@
// CHECK: memref.copy {{.*}}, {{.*}} {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<2x32xf32, #gpu.address_space<workgroup>> to memref<2x32xf32
// CHECK: gpu.barrier
-
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[]]>
-#translation = #iree_codegen.translation_info<LLVMGPUVectorize>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-// Pure reduction case, skip tiling.
-hal.executable @reduction_dispatch {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @predict_dispatch_153 layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [1: index, 1: index, 1: index]
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [1, 1, 1]>
+module {
+ func.func @predict_dispatch_153() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0x7FC00000 : f32
+ %cst_0 = arith.constant 0xFF800000 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
+ linalg.fill {lowering_config = #config} ins(%cst_0 : f32) outs(%1 : memref<f32>)
+ linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f32, %out: f32):
+ %2 = arith.cmpf ogt, %in, %out : f32
+ %3 = arith.select %2, %in, %out : f32
+ %4 = arith.cmpf uno, %in, %out : f32
+ %5 = arith.select %4, %cst, %3 : f32
+ linalg.yield %5 : f32
}
- builtin.module {
- func.func @predict_dispatch_153() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0x7FC00000 : f32
- %cst_0 = arith.constant 0xFF800000 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
- linalg.fill {lowering_config = #config} ins(%cst_0 : f32) outs(%1 : memref<f32>)
- linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) attrs = {lowering_config = #config} {
- ^bb0(%arg0: f32, %arg1: f32): // no predecessors
- %2 = arith.cmpf ogt, %arg0, %arg1 : f32
- %3 = arith.select %2, %arg0, %arg1 : f32
- %4 = arith.cmpf uno, %arg0, %arg1 : f32
- %5 = arith.select %4, %cst, %3 : f32
- linalg.yield %5 : f32
- }
- return
- }
- }
+ return
}
}
// CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[]{{\]}}>
-// CHECK: hal.executable public @reduction_dispatch
+// CHECK: func.func @predict_dispatch_153()
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.generic
@@ -307,58 +238,48 @@
// -----
-#translation = #iree_codegen.translation_info<LLVMGPUVectorize>
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 256, 4, 4, 4]]>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_dispatch {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @conv_dispatch layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @conv_dispatch() {
- %c56 = arith.constant 56 : index
- %c64 = arith.constant 64 : index
- %c802816 = arith.constant 802816 : index
- %c41664 = arith.constant 41664 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x64x56x56xf32>
- memref.assume_alignment %0, 64 : memref<1x64x56x56xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c41664) : memref<64x64x1x1xf32>
- memref.assume_alignment %1, 64 : memref<64x64x1x1xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c802816) : memref<1x64x56x56xf32>
- memref.assume_alignment %2, 64 : memref<1x64x56x56xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c64 step %workgroup_count_z {
- scf.for %arg1 = %workgroup_id_y to %c56 step %workgroup_count_y {
- %3 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_count_x]
- scf.for %arg2 = %3 to %c56 step %4 {
- %5 = affine.min affine_map<(d0) -> (256, -d0 + 56)>(%arg2)
- %6 = memref.subview %0[0, 0, %arg1, %arg2] [1, 64, 1, %5] [1, 1, 1, 1] : memref<1x64x56x56xf32> to memref<1x64x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>
- %7 = memref.subview %1[%arg0, 0, 0, 0] [1, 64, 1, 1] [1, 1, 1, 1] : memref<64x64x1x1xf32> to memref<1x64x1x1xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 64 + s0 + d1 + d2 + d3)>>
- %8 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, 1, 1, %5] [1, 1, 1, 1] : memref<1x64x56x56xf32> to memref<1x1x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>
- linalg.fill{lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 256, 4, 4, 4]]>} ins(%cst : f32) outs(%8 : memref<1x1x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>)
- linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 1, 256, 4, 4, 4]]>, strides = dense<1> : vector<2xi64>} ins(%6, %7 : memref<1x64x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>, memref<1x64x1x1xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 64 + s0 + d1 + d2 + d3)>>) outs(%8 : memref<1x1x1x?xf32, affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>>)
- }
- }
+#map = affine_map<()[s0] -> (s0 * 256)>
+#map1 = affine_map<(d0) -> (256, -d0 + 56)>
+#map2 = affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 200704 + s0 + d1 * 3136 + d2 * 56 + d3)>
+#map3 = affine_map<(d0, d1, d2, d3)[s0] -> (d0 * 64 + s0 + d1 + d2 + d3)>
+#translation = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1]>
+module {
+ func.func @conv_dispatch() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c56 = arith.constant 56 : index
+ %c64 = arith.constant 64 : index
+ %c802816 = arith.constant 802816 : index
+ %c41664 = arith.constant 41664 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x64x56x56xf32>
+ memref.assume_alignment %0, 64 : memref<1x64x56x56xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c41664) : memref<64x64x1x1xf32>
+ memref.assume_alignment %1, 64 : memref<64x64x1x1xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c802816) : memref<1x64x56x56xf32>
+ memref.assume_alignment %2, 64 : memref<1x64x56x56xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c64 step %workgroup_count_z {
+ scf.for %arg1 = %workgroup_id_y to %c56 step %workgroup_count_y {
+ %3 = affine.apply #map()[%workgroup_id_x]
+ %4 = affine.apply #map()[%workgroup_count_x]
+ scf.for %arg2 = %3 to %c56 step %4 {
+ %5 = affine.min #map1(%arg2)
+ %subview = memref.subview %0[0, 0, %arg1, %arg2] [1, 64, 1, %5] [1, 1, 1, 1] : memref<1x64x56x56xf32> to memref<1x64x1x?xf32, #map2>
+ %subview_0 = memref.subview %1[%arg0, 0, 0, 0] [1, 64, 1, 1] [1, 1, 1, 1] : memref<64x64x1x1xf32> to memref<1x64x1x1xf32, #map3>
+ %subview_1 = memref.subview %2[0, %arg0, %arg1, %arg2] [1, 1, 1, %5] [1, 1, 1, 1] : memref<1x64x56x56xf32> to memref<1x1x1x?xf32, #map2>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<1x1x1x?xf32, #map2>)
+ linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, lowering_config = #config, strides = dense<1> : vector<2xi64>} ins(%subview, %subview_0 : memref<1x64x1x?xf32, #map2>, memref<1x64x1x1xf32, #map3>) outs(%subview_1 : memref<1x1x1x?xf32, #map2>)
}
- return
}
}
+ return
}
}
@@ -372,68 +293,60 @@
// CHECK: scf.for
// CHECK: linalg.conv_2d_nchw_fchw
-
// -----
-// Check contract-4d, we currently emit suboptimal code as we don't distribute
-// more than 3 dimensions but make sure we emit correct code.
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 2, 256, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @contract_4d {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @contract_4d layout(#pipeline_layout) attributes {
- workgroup_size = [64 : index, 8 : index, 1 : index]
- }
- builtin.module {
- func.func @contract_4d() {
- %c12 = arith.constant 12 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 8.000000e+00 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %6 = arith.index_cast %0 : i32 to index
- %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) : memref<?x?x12x64xf32>{%6, %6}
- %13 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) : memref<?x?x12x64xf32>{%6, %6}
- %15 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<?x12x?x?xf32>{%6, %6, %6}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c12 step %workgroup_count_z {
- %16 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%workgroup_id_y]
- %17 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%workgroup_count_y]
- scf.for %arg1 = %16 to %6 step %17 {
- %18 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x]
- %19 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_count_x]
- scf.for %arg2 = %18 to %6 step %19 {
- %20 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 2)>(%arg1)[%6]
- %21 = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 256)>(%arg2)[%6]
- %22 = memref.subview %15[0, %arg0, %arg1, %arg2] [%6, 1, %20, %21] [1, 1, 1, 1] : memref<?x12x?x?xf32> to memref<?x1x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3)>>
- %24 = memref.subview %12[0, %arg1, %arg0, 0] [%6, %20, 1, 64] [1, 1, 1, 1] : memref<?x?x12x64xf32> to memref<?x?x1x64xf32, affine_map<(d0, d1, d2, d3)[s0, s1] -> (d0 * s1 + s0 + d1 * 768 + d2 * 64 + d3)>>
- %25 = memref.subview %13[0, %arg2, %arg0, 0] [%6, %21, 1, 64] [1, 1, 1, 1] : memref<?x?x12x64xf32> to memref<?x?x1x64xf32, affine_map<(d0, d1, d2, d3)[s0, s1] -> (d0 * s1 + s0 + d1 * 768 + d2 * 64 + d3)>>
- linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 2, 256, 4]]>} ins(%cst_0 : f32) outs(%22 : memref<?x1x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3)>>)
- linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d1, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<?x?x1x64xf32, affine_map<(d0, d1, d2, d3)[s0, s1] -> (d0 * s1 + s0 + d1 * 768 + d2 * 64 + d3)>>, memref<?x?x1x64xf32, affine_map<(d0, d1, d2, d3)[s0, s1] -> (d0 * s1 + s0 + d1 * 768 + d2 * 64 + d3)>>) outs(%22 : memref<?x1x?x?xf32, affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3)>>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 2, 256, 4]]>} {
- ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
- %26 = arith.mulf %arg3, %arg4 : f32
- %27 = arith.addf %26, %arg5 : f32
- linalg.yield %27 : f32
- }
- }
+#map = affine_map<()[s0] -> (s0 * 2)>
+#map1 = affine_map<()[s0] -> (s0 * 256)>
+#map2 = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
+#map3 = affine_map<(d0)[s0] -> (-d0 + s0, 256)>
+#map4 = affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3)>
+#map5 = affine_map<(d0, d1, d2, d3)[s0, s1] -> (d0 * s1 + s0 + d1 * 768 + d2 * 64 + d3)>
+#map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d1, d4)>
+#map7 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d4)>
+#map8 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
+module {
+ func.func @contract_4d() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c12 = arith.constant 12 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 8.000000e+00 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = arith.index_cast %0 : i32 to index
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%1) : memref<?x?x12x64xf32>{%1, %1}
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%1) : memref<?x?x12x64xf32>{%1, %1}
+ %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<?x12x?x?xf32>{%1, %1, %1}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c12 step %workgroup_count_z {
+ %5 = affine.apply #map()[%workgroup_id_y]
+ %6 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %5 to %1 step %6 {
+ %7 = affine.apply #map1()[%workgroup_id_x]
+ %8 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg2 = %7 to %1 step %8 {
+ %9 = affine.min #map2(%arg1)[%1]
+ %10 = affine.min #map3(%arg2)[%1]
+ %subview = memref.subview %4[0, %arg0, %arg1, %arg2] [%1, 1, %9, %10] [1, 1, 1, 1] : memref<?x12x?x?xf32> to memref<?x1x?x?xf32, #map4>
+ %subview_1 = memref.subview %2[0, %arg1, %arg0, 0] [%1, %9, 1, 64] [1, 1, 1, 1] : memref<?x?x12x64xf32> to memref<?x?x1x64xf32, #map5>
+ %subview_2 = memref.subview %3[0, %arg2, %arg0, 0] [%1, %10, 1, 64] [1, 1, 1, 1] : memref<?x?x12x64xf32> to memref<?x?x1x64xf32, #map5>
+ linalg.fill {lowering_config = #config} ins(%cst_0 : f32) outs(%subview : memref<?x1x?x?xf32, #map4>)
+ linalg.generic {indexing_maps = [#map6, #map7, #map8], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%subview_1, %subview_2 : memref<?x?x1x64xf32, #map5>, memref<?x?x1x64xf32, #map5>) outs(%subview : memref<?x1x?x?xf32, #map4>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f32, %in_3: f32, %out: f32):
+ %11 = arith.mulf %in, %in_3 : f32
+ %12 = arith.addf %11, %out : f32
+ linalg.yield %12 : f32
}
}
- return
}
}
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir
index 65741cc..5b2cca6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir
@@ -1,35 +1,25 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s
-hal.executable @warp_reduction_dispatch {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @forward_dispatch_0_generic_320x320x3x3 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @forward_dispatch_0_generic_320x320x3x3() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 320, 320, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>> -> tensor<3x320x320x3xf32>
- %3 = tensor.empty() : tensor<320x320x3x3xf32>
- %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d2, d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<3x320x320x3xf32>) outs(%3 : tensor<320x320x3x3xf32>) {
- ^bb0(%in: f32, %out: f32):
- %5 = arith.addf %in, %cst : f32
- linalg.yield %5 : f32
- } -> tensor<320x320x3x3xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : tensor<320x320x3x3xf32> -> !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#map = affine_map<(d0, d1, d2, d3) -> (d2, d1, d0, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @forward_dispatch_0_generic_320x320x3x3() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 320, 320, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>> -> tensor<3x320x320x3xf32>
+ %3 = tensor.empty() : tensor<320x320x3x3xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<3x320x320x3xf32>) outs(%3 : tensor<320x320x3x3xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %5 = arith.addf %in, %cst : f32
+ linalg.yield %5 : f32
+ } -> tensor<320x320x3x3xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : tensor<320x320x3x3xf32> -> !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
+ return
}
}
-}
-
-// CHECK-LABEL: hal.executable.export public @forward_dispatch_0_generic_320x320x3x3
-// CHECK: workgroup_size = [3 : index, 3 : index, 7 : index]}
-// CHECK-DAG: %[[C46:.+]] = arith.constant 46 : index
-// CHECK-DAG: %[[C320:.+]] = arith.constant 320 : index
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK: hal.return %[[C46]], %[[C320]], %[[C1]] : index, index, index
+// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [3, 3, 7] subgroup_size = 32>
+// CHECK: func.func @forward_dispatch_0_generic_320x320x3x3()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
index 7c9cc57..02d8575 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
@@ -1,7 +1,7 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline)))" \
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \
// RUN: --split-input-file %s | FileCheck %s
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-rocdl-configuration-pipeline)))" \
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-rocdl-configuration-pipeline)" \
// RUN: --split-input-file %s | FileCheck %s
// Make sure that the GPU configuration pipelines generalize named ops, e.g., linalg.matmul_transpose_b to linalg.generic.
@@ -10,31 +10,23 @@
// CHECK-NEXT: linalg.generic
// CHECK-NOT: linalg.matmul_transpose_b
-hal.executable public @main_dispatch_517 {
- hal.executable.variant public @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx902"}>) {
- hal.executable.export public @warp_reduction_large_vector ordinal(0) layout(
- #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @warp_reduction_large_vector() {
- %cst = arith.constant 0.000000e+00 : f32
- %c128 = arith.constant 128 : index
- %c0 = arith.constant 0 : index
- %c394240 = arith.constant 394240 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c394240) : !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>> -> tensor<1x1280xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>> -> tensor<1280x1280xf32>
- %5 = tensor.empty() : tensor<1x1280xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
- %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<1x1280xf32>, tensor<1280x1280xf32>) outs(%6 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : tensor<1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx902"}>
+module {
+ func.func @warp_reduction_large_vector() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c0 = arith.constant 0 : index
+ %c394240 = arith.constant 394240 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c394240) : !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>> -> tensor<1x1280xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>> -> tensor<1280x1280xf32>
+ %5 = tensor.empty() : tensor<1x1280xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
+ %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<1x1280xf32>, tensor<1280x1280xf32>) outs(%6 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : tensor<1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
+ return
}
}
+
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 4227e0d..c096c9f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -1,165 +1,116 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline)))" \
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false %s | FileCheck %s
// Transform dialect attributes are tested separately.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @add_dispatch_0 {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @add_dispatch_0 layout(#pipeline_layout)
- builtin.module {
- func.func @add_dispatch_0() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16384xf32>>
- %3 = tensor.empty() : tensor<16384xf32>
- %4 = flow.dispatch.tensor.load %0, offsets=[0], sizes=[16384], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
- %5 = flow.dispatch.tensor.load %1, offsets=[0], sizes=[16384], strides=[1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
- %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4, %5 : tensor<16384xf32>, tensor<16384xf32>) outs(%3 : tensor<16384xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
- %7 = arith.addf %arg0, %arg1 : f32
- linalg.yield %7 : f32
- } -> tensor<16384xf32>
- flow.dispatch.tensor.store %6, %2, offsets=[0], sizes=[16384], strides=[1] : tensor<16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<16384xf32>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#map = affine_map<(d0) -> (d0)>
+module {
+ func.func @add_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16384xf32>>
+ %3 = tensor.empty() : tensor<16384xf32>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [16384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [16384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
+ %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%4, %5 : tensor<16384xf32>, tensor<16384xf32>) outs(%3 : tensor<16384xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %7 = arith.addf %in, %in_0 : f32
+ linalg.yield %7 : f32
+ } -> tensor<16384xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [16384], strides = [1] : tensor<16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<16384xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[256]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize>
-// CHECK: hal.executable.export public @add_dispatch_0
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1] subgroup_size = 32>
// CHECK: func.func @add_dispatch_0
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @dot_dispatch_1 {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @dot_dispatch_1 layout(#pipeline_layout)
- builtin.module {
- func.func @dot_dispatch_1() {
- %c0 = arith.constant 0 : index
- %c4 = arith.constant 4 : index
- %c2 = arith.constant 2 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2x3xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2x4xf32>
- linalg.fill ins(%cst : f32) outs(%2 : memref<2x4xf32>)
- linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+module {
+ func.func @dot_dispatch_1() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c4 = arith.constant 4 : index
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2x3xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2x4xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<2x4xf32>)
+ linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>)
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 2, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @dot_dispatch_1
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 4 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [2, 4, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
// CHECK: func.func @dot_dispatch_1
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @unaligned_k {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @unaligned_k layout(#pipeline_layout)
- builtin.module {
- func.func @unaligned_k() {
- %c0 = arith.constant 0 : index
- %c4 = arith.constant 4 : index
- %c2 = arith.constant 2 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<128x258xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<258x64xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<128x64xf32>
- linalg.fill ins(%cst : f32) outs(%2 : memref<128x64xf32>)
- linalg.matmul ins(%0, %1 : memref<128x258xf32>, memref<258x64xf32>) outs(%2 : memref<128x64xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+module {
+ func.func @unaligned_k() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c4 = arith.constant 4 : index
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<128x258xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<258x64xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<128x64xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<128x64xf32>)
+ linalg.matmul ins(%0, %1 : memref<128x258xf32>, memref<258x64xf32>) outs(%2 : memref<128x64xf32>)
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 128, 2]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @unaligned_k
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
// CHECK: func.func @unaligned_k
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
-
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @reduction_dispatch {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @predict_dispatch_153 layout(#pipeline_layout)
- builtin.module {
- func.func @predict_dispatch_153() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0x7FC00000 : f32
- %cst_0 = arith.constant 0xFF800000 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
- linalg.fill ins(%cst_0 : f32) outs(%1 : memref<f32>)
- linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) {
- ^bb0(%arg0: f32, %arg1: f32): // no predecessors
- %2 = arith.cmpf ogt, %arg0, %arg1 : f32
- %3 = arith.select %2, %arg0, %arg1 : f32
- %4 = arith.cmpf uno, %arg0, %arg1 : f32
- %5 = arith.select %4, %cst, %3 : f32
- linalg.yield %5 : f32
- }
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module {
+ func.func @predict_dispatch_153() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0x7FC00000 : f32
+ %cst_0 = arith.constant 0xFF800000 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
+ linalg.fill ins(%cst_0 : f32) outs(%1 : memref<f32>)
+ linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) {
+ ^bb0(%in: f32, %out: f32):
+ %2 = arith.cmpf ogt, %in, %out : f32
+ %3 = arith.select %2, %in, %out : f32
+ %4 = arith.cmpf uno, %in, %out : f32
+ %5 = arith.select %4, %cst, %3 : f32
+ linalg.yield %5 : f32
}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute>
-// CHECK: hal.executable.export public @predict_dispatch_153
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [1, 1, 1]>
+// CHECK: func.func @predict_dispatch_153()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -167,44 +118,31 @@
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @reduction_aligned2 {
- hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export public @reduction_aligned2 ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @reduction_aligned2() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 128, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>> -> tensor<4x128x384xf32>
- %3 = tensor.empty() : tensor<128x384xf32>
- %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<128x384xf32>) -> tensor<128x384xf32>
- %5 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d2, d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%2 : tensor<4x128x384xf32>) outs(%4 : tensor<128x384xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %6 = arith.addf %arg0, %arg1 : f32
- linalg.yield %6 : f32
- } -> tensor<128x384xf32>
- flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#map = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @reduction_aligned2() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 128, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>> -> tensor<4x128x384xf32>
+ %3 = tensor.empty() : tensor<128x384xf32>
+ %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<128x384xf32>) -> tensor<128x384xf32>
+ %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<4x128x384xf32>) outs(%4 : tensor<128x384xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %out : f32
+ linalg.yield %6 : f32
+ } -> tensor<128x384xf32>
+ flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 128, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize>
-// CHECK: hal.executable.export public @reduction_aligned2
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1] subgroup_size = 32>
+// CHECK: func.func @reduction_aligned2()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -212,163 +150,114 @@
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @copy_as_generic {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @copy_as_generic layout(#pipeline_layout)
- builtin.module {
- func.func @copy_as_generic() {
- %c0 = arith.constant 0 : index
- %d0 = hal.interface.constant.load[0] : index
- %d1 = hal.interface.constant.load[1] : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%d0, %d1}
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%d0, %d1}
- linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]}
- ins(%0 : memref<?x?xi32>) outs(%1 : memref<?x?xi32>) {
- ^bb0(%arg4: i32, %s: i32): // no predecessors
- linalg.yield %arg4 : i32
- }
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @copy_as_generic() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%0, %1}
+ linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<?x?xi32>) outs(%3 : memref<?x?xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ linalg.yield %in : i32
}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute>
-// CHECK: hal.executable.export public @copy_as_generic
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [64, 1, 1] subgroup_size = 32>
+// CHECK: func.func @copy_as_generic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_1d_fft_stage2 {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @static_1d_fft_stage2 layout(#pipeline_layout)
- builtin.module {
- func.func @static_1d_fft_stage2() {
- %c0 = arith.constant 0 : index
- %c2 = arith.constant 2 : index
- %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
- %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %4:2 = iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
- flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+module {
+ func.func @static_1d_fft_stage2() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
+ %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %4:2 = iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
+ flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute>
-// CHECK: hal.executable.export public @static_1d_fft_stage2
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [32, 1, 1]>
+// CHECK: func.func @static_1d_fft_stage2()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.fft
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_3d_fft_stage3 {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @static_3d_fft_stage3 layout(#pipeline_layout)
- builtin.module {
- func.func @static_3d_fft_stage3() {
- %c0 = arith.constant 0 : index
- %c3 = arith.constant 3 : index
- %c64 = arith.constant 64 : index
- %c128 = arith.constant 128 : index
- %c32 = arith.constant 32 : index
- %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
- %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
- %0 = bufferization.to_memref %cst_0 : memref<4xf32>
- %1 = bufferization.to_memref %cst : memref<4xf32>
- %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
- %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
- iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"}
- ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>)
- outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+module {
+ func.func @static_3d_fft_stage3() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c3 = arith.constant 3 : index
+ %c64 = arith.constant 64 : index
+ %c128 = arith.constant 128 : index
+ %c32 = arith.constant 32 : index
+ %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
+ %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
+ %0 = bufferization.to_memref %cst_0 : memref<4xf32>
+ %1 = bufferization.to_memref %cst : memref<4xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
+ iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute>
-// CHECK: hal.executable.export public @static_3d_fft_stage3
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [32, 1, 1]>
+// CHECK: func.func @static_3d_fft_stage3()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.fft
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 128, 64]]>,
- translation_info = <LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>,
- workgroup_size = [16, 8, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @user_config {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export public @_lowering_config_test_dispatch_1 layout(#pipeline_layout)
- builtin.module {
- func.func @_lowering_config_test_dispatch_1() {
- %cst = arith.constant 0.000000e+00 : f32
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
- %15 = tensor.empty() : tensor<128x1024xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation}
- ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%16 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- return
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 64]]>
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [16, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @_lowering_config_test_dispatch_1() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c1024 = arith.constant 1024 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
+ %5 = tensor.empty() : tensor<128x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation} ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ return
}
}
-}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 128, 64]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @_lowering_config_test_dispatch_1
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [16, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+// CHECK: func.func @_lowering_config_test_dispatch_1()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 8 : index, 1 : index]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.matmul
@@ -376,304 +265,209 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @sort_op {
- hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @sort_op layout(#pipeline_layout)
- builtin.module {
- func.func @sort_op() {
- %c1 = arith.constant 1 : index
- %c0 = arith.constant 0 : index
- %c2304000 = arith.constant 2304000 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<1x576000xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<1x576000xi32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<1x576000xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c2304000) : !flow.dispatch.tensor<writeonly:tensor<1x576000xi32>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x576000xf32>> -> tensor<1x576000xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x576000xi32>> -> tensor<1x576000xi32>
- %9:2 = iree_linalg_ext.sort dimension(1) outs(%4, %5 : tensor<1x576000xf32>, tensor<1x576000xi32>) {
- ^bb0(%arg1: f32, %arg2: f32, %arg3: i32, %arg4: i32): // no predecessors
- %10 = arith.cmpf ogt, %arg1, %arg2 : f32
- iree_linalg_ext.yield %10 : i1
- } -> tensor<1x576000xf32>, tensor<1x576000xi32>
- flow.dispatch.tensor.store %9#0, %2, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x576000xf32>>
- flow.dispatch.tensor.store %9#1, %3, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x576000xi32>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+module {
+ func.func @sort_op() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c2304000 = arith.constant 2304000 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<1x576000xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<1x576000xi32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<1x576000xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c2304000) : !flow.dispatch.tensor<writeonly:tensor<1x576000xi32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x576000xf32>> -> tensor<1x576000xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x576000xi32>> -> tensor<1x576000xi32>
+ %6:2 = iree_linalg_ext.sort dimension(1) outs(%4, %5 : tensor<1x576000xf32>, tensor<1x576000xi32>) {
+ ^bb0(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: i32):
+ %7 = arith.cmpf ogt, %arg0, %arg1 : f32
+ iree_linalg_ext.yield %7 : i1
+ } -> tensor<1x576000xf32>, tensor<1x576000xi32>
+ flow.dispatch.tensor.store %6#0, %2, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x576000xf32>>
+ flow.dispatch.tensor.store %6#1, %3, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xi32> -> !flow.dispatch.tensor<writeonly:tensor<1x576000xi32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute>
-// CHECK: hal.executable.export public @sort_op
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [64, 1, 1]>
+// CHECK: func.func @sort_op()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.sort
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @user_config {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @matmul_config_sm35 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_config_sm35() {
- %cst = arith.constant 0.000000e+00 : f32
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
- %15 = tensor.empty() : tensor<128x1024xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- %17 = linalg.matmul
- ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%16 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+module {
+ func.func @matmul_config_sm35() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c1024 = arith.constant 1024 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
+ %5 = tensor.empty() : tensor<128x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ return
}
}
-}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @matmul_config_sm35
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+// CHECK: func.func @matmul_config_sm35()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @user_config {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @matmul_config_sm80 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_config_sm80() {
- %cst = arith.constant 0.000000e+00 : f32
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
- %15 = tensor.empty() : tensor<128x1024xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- %17 = linalg.matmul
- ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%16 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @matmul_config_sm80() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c1024 = arith.constant 1024 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
+ %5 = tensor.empty() : tensor<128x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ return
}
}
-}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore
-// CHECK: hal.executable.export public @matmul_config_sm80
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync workgroup_size = [64, 2, 1] subgroup_size = 32
+// CHECK: func.func @matmul_config_sm80()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @user_config {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_86"}>) {
- hal.executable.export public @matmul_config_sm86 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_config_sm86() {
- %cst = arith.constant 0.000000e+00 : f32
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
- %15 = tensor.empty() : tensor<128x1024xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- %17 = linalg.matmul
- ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%16 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_86"}>
+module {
+ func.func @matmul_config_sm86() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c1024 = arith.constant 1024 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
+ %5 = tensor.empty() : tensor<128x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ return
}
}
-}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore
-// CHECK: hal.executable.export public @matmul_config_sm86
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync workgroup_size = [64, 2, 1] subgroup_size = 32
+// CHECK: func.func @matmul_config_sm86()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @contract_reduction {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_86"}>) {
- hal.executable.export public @contract_reduction layout(#pipeline_layout)
- builtin.module {
- func.func @contract_reduction() {
- %c0 = arith.constant 0 : index
- %c40064 = arith.constant 40064 : index
- %c34752 = arith.constant 34752 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x7xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c40064) : !flow.dispatch.tensor<readonly:tensor<3x64x4x8xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c34752) : !flow.dispatch.tensor<writeonly:tensor<3x64xf32>>
- %3 = tensor.empty() : tensor<3x64xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 4], sizes = [3, 64, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x64x4x8xf32>> -> tensor<3x64x4xf32>
- %5 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%3 : tensor<3x64xf32>) -> tensor<3x64xf32>
- %6 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x7xf32>> -> tensor<f32>
- %7 = linalg.generic {indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> ()>,
- affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%4, %6 : tensor<3x64x4xf32>, tensor<f32>) outs(%5 : tensor<3x64xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %8 = arith.subf %in, %in_0 : f32
- %9 = arith.maximumf %8, %cst : f32
- %10 = arith.mulf %9, %9 : f32
- %11 = arith.addf %out, %10 : f32
- linalg.yield %11 : f32
- } -> tensor<3x64xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3, 64], strides = [1, 1] : tensor<3x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<3x64xf32>>
- return
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_86"}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> ()>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @contract_reduction() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c40064 = arith.constant 40064 : index
+ %c34752 = arith.constant 34752 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x7xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c40064) : !flow.dispatch.tensor<readonly:tensor<3x64x4x8xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c34752) : !flow.dispatch.tensor<writeonly:tensor<3x64xf32>>
+ %3 = tensor.empty() : tensor<3x64xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 4], sizes = [3, 64, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x64x4x8xf32>> -> tensor<3x64x4xf32>
+ %5 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%3 : tensor<3x64xf32>) -> tensor<3x64xf32>
+ %6 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3x7xf32>> -> tensor<f32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4, %6 : tensor<3x64x4xf32>, tensor<f32>) outs(%5 : tensor<3x64xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.subf %in, %in_0 : f32
+ %9 = arith.maximumf %8, %cst : f32
+ %10 = arith.mulf %9, %9 : f32
+ %11 = arith.addf %out, %10 : f32
+ linalg.yield %11 : f32
+ } -> tensor<3x64xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3, 64], strides = [1, 1] : tensor<3x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<3x64xf32>>
+ return
}
-}
+
}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize
-// CHECK: hal.executable.export public @contract_reduction
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1] subgroup_size = 32
+// CHECK: func.func @contract_reduction()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @dynamic_pack_2x2 {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_86"}>) {
- hal.executable.export public @dynamic_pack_2x2 layout(#pipeline_layout)
- builtin.module {
- func.func @dynamic_pack_2x2() {
- %c0 = arith.constant 0 : index
- %c64 = arith.constant 64 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = arith.index_castui %0 : i32 to index
- %5 = arith.index_castui %1 : i32 to index
- %6 = arith.index_castui %2 : i32 to index
- %7 = arith.index_castui %3 : i32 to index
- %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%4, %5}
- %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x2x2xi32>>{%6, %7}
- %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%4, %5} -> tensor<?x?xi32>
- %11 = tensor.empty(%6, %7) : tensor<?x?x2x2xi32>
- %pack = tensor.pack %10 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %11 : tensor<?x?xi32> -> tensor<?x?x2x2xi32>
- flow.dispatch.tensor.store %pack, %9, offsets = [0, 0, 0, 0], sizes = [%6, %7, 2, 2], strides = [1, 1, 1, 1] : tensor<?x?x2x2xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x2x2xi32>>{%6, %7}
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_86"}>
+module {
+ func.func @dynamic_pack_2x2() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c64 = arith.constant 64 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_castui %0 : i32 to index
+ %5 = arith.index_castui %1 : i32 to index
+ %6 = arith.index_castui %2 : i32 to index
+ %7 = arith.index_castui %3 : i32 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%4, %5}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x2x2xi32>>{%6, %7}
+ %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%4, %5} -> tensor<?x?xi32>
+ %11 = tensor.empty(%6, %7) : tensor<?x?x2x2xi32>
+ %pack = tensor.pack %10 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %11 : tensor<?x?xi32> -> tensor<?x?x2x2xi32>
+ flow.dispatch.tensor.store %pack, %9, offsets = [0, 0, 0, 0], sizes = [%6, %7, 2, 2], strides = [1, 1, 1, 1] : tensor<?x?x2x2xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x2x2xi32>>{%6, %7}
+ return
}
}
-}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 16]]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPackUnPack>
-// CHECK: hal.executable.export public @dynamic_pack_2x2
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPackUnPack workgroup_size = [32, 1, 1]>
+// CHECK: func.func @dynamic_pack_2x2()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: func.func @dynamic_pack_2x2
// CHECK: tensor.pack
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @user_config {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @large_matmul_f16 layout(#pipeline_layout)
- builtin.module {
- func.func @large_matmul_f16() {
- %cst = arith.constant 0.000000e+00 : f16
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<2560x1792xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1792x2048xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<2560x2048xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<2560x1792xf16>> -> tensor<2560x1792xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1792x2048xf16>> -> tensor<1792x2048xf16>
- %15 = tensor.empty() : tensor<2560x2048xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16>
- %17 = linalg.matmul
- ins(%3, %4 : tensor<2560x1792xf16>, tensor<1792x2048xf16>) outs(%16 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf16> -> !flow.dispatch.tensor<writeonly:tensor<2560x2048xf16>>
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @large_matmul_f16() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 0.000000e+00 : f16
+ %c128 = arith.constant 128 : index
+ %c1024 = arith.constant 1024 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<2560x1792xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1792x2048xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<2560x2048xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2560x1792xf16>> -> tensor<2560x1792xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1792x2048xf16>> -> tensor<1792x2048xf16>
+ %5 = tensor.empty() : tensor<2560x2048xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2560x1792xf16>, tensor<1792x2048xf16>) outs(%6 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf16> -> !flow.dispatch.tensor<writeonly:tensor<2560x2048xf16>>
+ return
}
}
-}
-
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 256, 32]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 3 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @large_matmul_f16
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync workgroup_size = [128, 2, 1] subgroup_size = 32, {pipeline_depth = 3 : i64, store_stage = 1 : i64}>
+// CHECK: func.func @large_matmul_f16()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [128 : index, 2 : index, 1 : index]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.matmul
@@ -681,46 +475,30 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @user_config {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @large_matmul_f32 layout(#pipeline_layout)
- builtin.module {
- func.func @large_matmul_f32() {
- %cst = arith.constant 0.000000e+00 : f32
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<2560x1792xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1792x2048xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<2560x2048xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<2560x1792xf32>> -> tensor<2560x1792xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1792x2048xf32>> -> tensor<1792x2048xf32>
- %15 = tensor.empty() : tensor<2560x2048xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32>
- %17 = linalg.matmul
- ins(%3, %4 : tensor<2560x1792xf32>, tensor<1792x2048xf32>) outs(%16 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<2560x2048xf32>>
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @large_matmul_f32() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c1024 = arith.constant 1024 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<2560x1792xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1792x2048xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<2560x2048xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2560x1792xf32>> -> tensor<2560x1792xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1792x2048xf32>> -> tensor<1792x2048xf32>
+ %5 = tensor.empty() : tensor<2560x2048xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2560x1792xf32>, tensor<1792x2048xf32>) outs(%6 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<2560x2048xf32>>
+ return
}
}
-}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 256, 16]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @large_matmul_f32
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync workgroup_size = [128, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
+// CHECK: func.func @large_matmul_f32()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [128 : index, 2 : index, 1 : index]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.matmul
@@ -728,159 +506,124 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @inner_unit_dim {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @inner_unit_dim layout(#pipeline_layout)
- builtin.module {
- func.func @inner_unit_dim() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16384x1xf32>>
- %3 = tensor.empty() : tensor<16384x1xf32>
- %4 = flow.dispatch.tensor.load %0, offsets=[0, 0], sizes=[16384, 1], strides=[1, 1] : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>> -> tensor<16384x1xf32>
- %5 = flow.dispatch.tensor.load %1, offsets=[0, 0], sizes=[16384, 1], strides=[1, 1] : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>> -> tensor<16384x1xf32>
- %6 = linalg.generic
- {indexing_maps =
- [affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%4, %5 : tensor<16384x1xf32>, tensor<16384x1xf32>) outs(%3 : tensor<16384x1xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
- %7 = arith.addf %arg0, %arg1 : f32
- linalg.yield %7 : f32
- } -> tensor<16384x1xf32>
- flow.dispatch.tensor.store %6, %2, offsets=[0, 0], sizes=[16384, 1], strides=[1, 1] : tensor<16384x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<16384x1xf32>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @inner_unit_dim() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16384x1xf32>>
+ %3 = tensor.empty() : tensor<16384x1xf32>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>> -> tensor<16384x1xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16384x1xf32>> -> tensor<16384x1xf32>
+ %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %5 : tensor<16384x1xf32>, tensor<16384x1xf32>) outs(%3 : tensor<16384x1xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %7 = arith.addf %in, %in_0 : f32
+ linalg.yield %7 : f32
+ } -> tensor<16384x1xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : tensor<16384x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<16384x1xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize>
-// CHECK: hal.executable.export public @inner_unit_dim
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1] subgroup_size = 32>
+// CHECK: func.func @inner_unit_dim()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
-// CHECK: func.func @inner_unit_dim
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32 {
- hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32 layout(#pipeline_layout)
- builtin.module {
- func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32() {
- %c0 = arith.constant 0 : index
- %c162508800 = arith.constant 162508800 : index
- %cst = arith.constant 1.001000e-05 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %cst_1 = arith.constant dense_resource<__elided__> : tensor<64xf32>
- %cst_2 = arith.constant dense_resource<__elided__> : tensor<64xf32>
- %cst_3 = arith.constant dense_resource<__elided__> : tensor<64xf32>
- %cst_4 = arith.constant dense_resource<__elided__> : tensor<64xf32>
- %cst_5 = arith.constant dense_resource<__elided__> : tensor<64xf32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x230x230x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<7x7x3x64xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c162508800) : !flow.dispatch.tensor<writeonly:tensor<256x112x112x64xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [256, 230, 230, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<256x230x230x3xf32>> -> tensor<256x230x230x3xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [7, 7, 3, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<7x7x3x64xf32>> -> tensor<7x7x3x64xf32>
- %5 = tensor.empty() : tensor<256x112x112x64xf32>
- %6 = linalg.fill ins(%cst_0 : f32) outs(%5 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32>
- %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<256x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%6 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32>
- %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %cst_1, %cst_2, %cst_3, %cst_4, %cst_5 : tensor<256x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%5 : tensor<256x112x112x64xf32>) {
- ^bb0(%in: f32, %in_6: f32, %in_7: f32, %in_8: f32, %in_9: f32, %in_10: f32, %out: f32):
- %9 = arith.addf %in_9, %cst : f32
- %10 = math.sqrt %9 : f32
- %11 = arith.addf %in, %in_6 : f32
- %12 = arith.subf %11, %in_7 : f32
- %13 = arith.mulf %12, %in_8 : f32
- %14 = arith.divf %13, %10 : f32
- %15 = arith.addf %14, %in_10 : f32
- %16 = arith.maximumf %15, %cst_0 : f32
- linalg.yield %16 : f32
- } -> tensor<256x112x112x64xf32>
- flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [256, 112, 112, 64], strides = [1, 1, 1, 1] : tensor<256x112x112x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x112x112x64xf32>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d3)>
+module {
+ func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c162508800 = arith.constant 162508800 : index
+ %cst = arith.constant 1.001000e-05 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant dense_resource<__elided__> : tensor<64xf32>
+ %cst_2 = arith.constant dense_resource<__elided__> : tensor<64xf32>
+ %cst_3 = arith.constant dense_resource<__elided__> : tensor<64xf32>
+ %cst_4 = arith.constant dense_resource<__elided__> : tensor<64xf32>
+ %cst_5 = arith.constant dense_resource<__elided__> : tensor<64xf32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x230x230x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<7x7x3x64xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c162508800) : !flow.dispatch.tensor<writeonly:tensor<256x112x112x64xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [256, 230, 230, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<256x230x230x3xf32>> -> tensor<256x230x230x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [7, 7, 3, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<7x7x3x64xf32>> -> tensor<7x7x3x64xf32>
+ %5 = tensor.empty() : tensor<256x112x112x64xf32>
+ %6 = linalg.fill ins(%cst_0 : f32) outs(%5 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<256x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%6 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32>
+ %8 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map1, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %cst_1, %cst_2, %cst_3, %cst_4, %cst_5 : tensor<256x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%5 : tensor<256x112x112x64xf32>) {
+ ^bb0(%in: f32, %in_6: f32, %in_7: f32, %in_8: f32, %in_9: f32, %in_10: f32, %out: f32):
+ %9 = arith.addf %in_9, %cst : f32
+ %10 = math.sqrt %9 : f32
+ %11 = arith.addf %in, %in_6 : f32
+ %12 = arith.subf %11, %in_7 : f32
+ %13 = arith.mulf %12, %in_8 : f32
+ %14 = arith.divf %13, %10 : f32
+ %15 = arith.addf %14, %in_10 : f32
+ %16 = arith.maximumf %15, %cst_0 : f32
+ linalg.yield %16 : f32
+ } -> tensor<256x112x112x64xf32>
+ flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [256, 112, 112, 64], strides = [1, 1, 1, 1] : tensor<256x112x112x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x112x112x64xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 8, 64, 1, 1, 4], [0, 1, 0, 0]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize>
-// CHECK: hal.executable.export public @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [16, 2, 1]>
// CHECK: func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-
-hal.executable public @_main_dispatch_15 {
- hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @_main_dispatch_15_generic_512x4x42x42x64_f32 ordinal(0) layout(#hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @_main_dispatch_15_generic_512x4x42x42x64_f32() {
- %cst = arith.constant 1.250000e-01 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = arith.index_castui %0 {stream.alignment = 64 : index, stream.values = [35524672 : index, 240930880 : index, 446337088 : index, 651743296 : index]} : i32 to index
- %4 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [57544768 : index, 262950976 : index, 468357184 : index, 673763392 : index]} : i32 to index
- %5 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [1728 : index, 36472832 : index, 72943744 : index, 109415936 : index]} : i32 to index
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>>
- %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>>
- %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor<writeonly:tensor<512x4x42x42xf32>>
- %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>> -> tensor<512x42x4x64xf32>
- %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>> -> tensor<512x42x4x64xf32>
- %11 = tensor.empty() : tensor<512x4x42x42xf32>
- %12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<512x4x42x42xf32>) -> tensor<512x4x42x42xf32>
- %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d1, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %10 : tensor<512x42x4x64xf32>, tensor<512x42x4x64xf32>) outs(%12 : tensor<512x4x42x42xf32>) {
- ^bb0(%in: f32, %in_1: f32, %out: f32):
- %15 = arith.mulf %in, %in_1 : f32
- %16 = arith.addf %out, %15 : f32
- linalg.yield %16 : f32
- } -> tensor<512x4x42x42xf32>
- %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<512x4x42x42xf32>) outs(%11 : tensor<512x4x42x42xf32>) {
- ^bb0(%in: f32, %out: f32):
- %15 = arith.mulf %in, %cst : f32
- linalg.yield %15 : f32
- } -> tensor<512x4x42x42xf32>
- flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0, 0], sizes = [512, 4, 42, 42], strides = [1, 1, 1, 1] : tensor<512x4x42x42xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x4x42x42xf32>>
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d1, d4)>
+#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d4)>
+#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @_main_dispatch_15_generic_512x4x42x42x64_f32() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %cst = arith.constant 1.250000e-01 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = arith.index_castui %0 {stream.alignment = 64 : index, stream.values = [35524672 : index, 240930880 : index, 446337088 : index, 651743296 : index]} : i32 to index
+ %4 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [57544768 : index, 262950976 : index, 468357184 : index, 673763392 : index]} : i32 to index
+ %5 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [1728 : index, 36472832 : index, 72943744 : index, 109415936 : index]} : i32 to index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>>
+ %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>>
+ %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor<writeonly:tensor<512x4x42x42xf32>>
+ %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>> -> tensor<512x42x4x64xf32>
+ %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x42x4x64xf32>> -> tensor<512x42x4x64xf32>
+ %11 = tensor.empty() : tensor<512x4x42x42xf32>
+ %12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<512x4x42x42xf32>) -> tensor<512x4x42x42xf32>
+ %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %10 : tensor<512x42x4x64xf32>, tensor<512x42x4x64xf32>) outs(%12 : tensor<512x4x42x42xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %15 = arith.mulf %in, %in_1 : f32
+ %16 = arith.addf %out, %15 : f32
+ linalg.yield %16 : f32
+ } -> tensor<512x4x42x42xf32>
+ %14 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<512x4x42x42xf32>) outs(%11 : tensor<512x4x42x42xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %15 = arith.mulf %in, %cst : f32
+ linalg.yield %15 : f32
+ } -> tensor<512x4x42x42xf32>
+ flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0, 0], sizes = [512, 4, 42, 42], strides = [1, 1, 1, 1] : tensor<512x4x42x42xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x4x42x42xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 32, 128, 32]{{\]}}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @_main_dispatch_15_generic_512x4x42x42x64_f32
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+// CHECK: func.func @_main_dispatch_15_generic_512x4x42x42x64_f32()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -888,107 +631,78 @@
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<1, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<2, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<3, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-
-hal.executable public @i4_dequant_matvec {
- hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
- hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @i4_dequant_matvec() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = hal.interface.constant.load[5] : i32
- %6 = hal.interface.constant.load[6] : i32
- %7 = hal.interface.constant.load[7] : i32
- %8 = hal.interface.constant.load[8] : i32
- %9 = arith.index_castui %0 : i32 to index
- %10 = arith.index_castui %1 : i32 to index
- %11 = arith.index_castui %2 : i32 to index
- %12 = arith.extui %3 : i32 to i64
- %13 = arith.extui %4 : i32 to i64
- %14 = arith.shli %13, %c32_i64 : i64
- %15 = arith.ori %12, %14 : i64
- %16 = arith.index_castui %15 : i64 to index
- %17 = arith.extui %5 : i32 to i64
- %18 = arith.extui %6 : i32 to i64
- %19 = arith.shli %18, %c32_i64 : i64
- %20 = arith.ori %17, %19 : i64
- %21 = arith.index_castui %20 : i64 to index
- %22 = arith.extui %7 : i32 to i64
- %23 = arith.extui %8 : i32 to i64
- %24 = arith.shli %23, %c32_i64 : i64
- %25 = arith.ori %22, %24 : i64
- %26 = arith.index_castui %25 : i64 to index
- %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x11008xi4>>
- %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf32>>
- %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf32>>
- %30 = flow.dispatch.workload.ordinal %26, 0 : index
- %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x11008xf32>>{%30}
- %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
- %33 = flow.dispatch.tensor.load %27, offsets = [0, 0], sizes = [4096, 11008], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x11008xi4>> -> tensor<4096x11008xi4>
- %34 = flow.dispatch.tensor.load %28, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4096xf32>> -> tensor<4096xf32>
- %35 = flow.dispatch.tensor.load %29, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4096xf32>> -> tensor<4096xf32>
- %36 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%30, 11008], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x11008xf32>>{%30} -> tensor<?x11008xf32>
- %37 = tensor.empty(%30) : tensor<?x4096xf32>
- %38 = tensor.empty() : tensor<4096x11008xf32>
- %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
- %40 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1) -> (d0, d1)>,
- affine_map<(d0, d1) -> (d0)>,
- affine_map<(d0, d1) -> (d0)>,
- affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%33, %34, %35 : tensor<4096x11008xi4>, tensor<4096xf32>, tensor<4096xf32>) outs(%38 : tensor<4096x11008xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %42 = arith.extui %in : i4 to i32
- %43 = arith.uitofp %42 : i32 to f32
- %44 = arith.subf %43, %in_1 : f32
- %45 = arith.mulf %44, %in_0 : f32
- linalg.yield %45 : f32
- } -> tensor<4096x11008xf32>
- %41 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%36, %40 : tensor<?x11008xf32>, tensor<4096x11008xf32>) outs(%39 : tensor<?x4096xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %42 = arith.mulf %in, %in_0 : f32
- %43 = arith.addf %42, %out : f32
- linalg.yield %43 : f32
- } -> tensor<?x4096xf32>
- flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @i4_dequant_matvec() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = hal.interface.constant.load[5] : i32
+ %6 = hal.interface.constant.load[6] : i32
+ %7 = hal.interface.constant.load[7] : i32
+ %8 = hal.interface.constant.load[8] : i32
+ %9 = arith.index_castui %0 : i32 to index
+ %10 = arith.index_castui %1 : i32 to index
+ %11 = arith.index_castui %2 : i32 to index
+ %12 = arith.extui %3 : i32 to i64
+ %13 = arith.extui %4 : i32 to i64
+ %14 = arith.shli %13, %c32_i64 : i64
+ %15 = arith.ori %12, %14 : i64
+ %16 = arith.index_castui %15 : i64 to index
+ %17 = arith.extui %5 : i32 to i64
+ %18 = arith.extui %6 : i32 to i64
+ %19 = arith.shli %18, %c32_i64 : i64
+ %20 = arith.ori %17, %19 : i64
+ %21 = arith.index_castui %20 : i64 to index
+ %22 = arith.extui %7 : i32 to i64
+ %23 = arith.extui %8 : i32 to i64
+ %24 = arith.shli %23, %c32_i64 : i64
+ %25 = arith.ori %22, %24 : i64
+ %26 = arith.index_castui %25 : i64 to index
+ %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x11008xi4>>
+ %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf32>>
+ %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf32>>
+ %30 = flow.dispatch.workload.ordinal %26, 0 : index
+ %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x11008xf32>>{%30}
+ %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+ %33 = flow.dispatch.tensor.load %27, offsets = [0, 0], sizes = [4096, 11008], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x11008xi4>> -> tensor<4096x11008xi4>
+ %34 = flow.dispatch.tensor.load %28, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4096xf32>> -> tensor<4096xf32>
+ %35 = flow.dispatch.tensor.load %29, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4096xf32>> -> tensor<4096xf32>
+ %36 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%30, 11008], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x11008xf32>>{%30} -> tensor<?x11008xf32>
+ %37 = tensor.empty(%30) : tensor<?x4096xf32>
+ %38 = tensor.empty() : tensor<4096x11008xf32>
+ %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+ %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x11008xi4>, tensor<4096xf32>, tensor<4096xf32>) outs(%38 : tensor<4096x11008xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %42 = arith.extui %in : i4 to i32
+ %43 = arith.uitofp %42 : i32 to f32
+ %44 = arith.subf %43, %in_1 : f32
+ %45 = arith.mulf %44, %in_0 : f32
+ linalg.yield %45 : f32
+ } -> tensor<4096x11008xf32>
+ %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%36, %40 : tensor<?x11008xf32>, tensor<4096x11008xf32>) outs(%39 : tensor<?x4096xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %42 = arith.mulf %in, %in_0 : f32
+ %43 = arith.addf %42, %out : f32
+ linalg.yield %43 : f32
+ } -> tensor<?x4096xf32>
+ flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 256]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 32>
+// CHECK-LABEL: func.func @i4_dequant_matvec()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
-// CHECK: func.func @i4_dequant_matvec()
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[$CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index 011994c..793acdb 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -1,311 +1,168 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" --verify-diagnostics --split-input-file %s
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" --verify-diagnostics --split-input-file %s
#config = #iree_codegen.lowering_config<tile_sizes = []>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [32 : index, 8 : index, 8 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{Total number of threads in a thread block 2048 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulSimt}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 8, 8], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{Total number of threads in a thread block 2048 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulSimt}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = []>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [32 : index, 8 : index, 2 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{Expected workgroup size in z-dim = 1, but got 2 with compilation pipeline LLVMGPUMatmulSimt}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 8, 2], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{Expected workgroup size in z-dim = 1, but got 2 with compilation pipeline LLVMGPUMatmulSimt}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 2 : index, 10 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
- // expected-error @+1 {{Total number of threads in a thread block 1280 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x32xf32>)
- outs(%result: memref<32x32xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 10], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
+ // expected-error @+1 {{Total number of threads in a thread block 1280 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [48 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
- // expected-error @+1 {{Number of threads in x-dim 48 is not a multiple of warp size (32) or integer units of warps in x-dim with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x32xf32>)
- outs(%result: memref<32x32xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [48, 2, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
+ // expected-error @+1 {{Number of threads in x-dim 48 is not a multiple of warp size (32) or integer units of warps in x-dim with compilation pipeline LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 2 : index, 2 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
- // expected-error @+1 {{Expected workgroup size in z-dim = 1, but got 2 with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x32xf32>)
- outs(%result: memref<32x32xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 2], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
+ // expected-error @+1 {{Expected workgroup size in z-dim = 1, but got 2 with compilation pipeline LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 20]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
- // expected-error @+1 {{Thread block shape 32, 32, 20 cannot be tiled on matmul shape 32, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x32xf32>)
- outs(%result: memref<32x32xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32>
+ // expected-error @+1 {{Thread block shape 32, 32, 20 cannot be tiled on matmul shape 32, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [128 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
- // expected-error @+1 {{Tensor Core instruction shape 16, 16, 8 cannot be tiled on warp shape 64, 8, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<1024x512xf32>, memref<512x256xf32>)
- outs(%result: memref<1024x256xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [128, 1, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
+ // expected-error @+1 {{Tensor Core instruction shape 16, 16, 8 cannot be tiled on warp shape 64, 8, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x32xf32>
- // expected-error @+1 {{Thread block shape 32, 32, 16 cannot be tiled on matmul shape 48, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<48x16xf32>, memref<16x32xf32>)
- outs(%result: memref<48x32xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x32xf32>
+ // expected-error @+1 {{Thread block shape 32, 32, 16 cannot be tiled on matmul shape 48, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<48x16xf32>, memref<16x32xf32>) outs(%2 : memref<48x32xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x48xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x48xf32>
- // expected-error @+1 {{Thread block shape 32, 32, 16 cannot be tiled on matmul shape 32, 48, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<32x16xf32>, memref<16x48xf32>)
- outs(%result: memref<32x48xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x48xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x48xf32>
+ // expected-error @+1 {{Thread block shape 32, 32, 16 cannot be tiled on matmul shape 32, 48, 16 with compilation pipeline LLVMGPUMatmulTensorCore}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x48xf32>) outs(%2 : memref<32x48xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[2, 32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @batch_matmul_func {
- hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
-builtin.module {
- func.func @illegal() {
+#map = affine_map<()[s0] -> (s0 * 8)>
+#map1 = affine_map<()[s0] -> (s0 * 32)>
+#map2 = affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>
+#map3 = affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>
+#map4 = affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
@@ -324,122 +181,72 @@
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z {
- %3 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_y]
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
scf.for %arg1 = %3 to %c32 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
+ %5 = affine.apply #map1()[%workgroup_id_x]
+ %6 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg2 = %5 to %c64 step %6 {
- %7 = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>
- %8 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>
- %9 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>
- linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
+ %subview = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, #map2>
+ %subview_0 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, #map3>
+ %subview_1 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, #map4>
+ linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<1x8x32xf32, #map4>)
// expected-error @+1 {{Received batch tile dimension of 2 instead of 0 for non-partitionable loops with compilation pipeline LLVMGPUMatmulTensorCore}}
- linalg.batch_matmul {lowering_config = #config} ins(%7, %8 : memref<1x8x1024xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>>, memref<1x1024x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>>) outs(%9 : memref<1x8x32xf32, affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>>)
+ linalg.batch_matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<1x8x1024xf32, #map2>, memref<1x1024x32xf32, #map3>) outs(%subview_1 : memref<1x8x32xf32, #map4>)
}
}
}
return
}
}
-}
-}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 48]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [128 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
- // expected-error @+1 {{Thread block shape 64, 32, 48 cannot be tiled on matmul shape 1024, 256, 512 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<1024x512xf32>, memref<512x256xf32>)
- outs(%result: memref<1024x256xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync workgroup_size = [128, 1, 1], {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
+ // expected-error @+1 {{Thread block shape 64, 32, 48 cannot be tiled on matmul shape 1024, 256, 512 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>)
+ return
}
}
// -----
-
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [128 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
- // expected-error @+1 {{Tensor Core instruction shape 16, 8, 8 cannot be tiled on warp shape 64, 8, 4 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<1024x512xf32>, memref<512x256xf32>)
- outs(%result: memref<1024x256xf32>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync workgroup_size = [128, 1, 1], {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32>
+ // expected-error @+1 {{Tensor Core instruction shape 16, 8, 8 cannot be tiled on warp shape 64, 8, 4 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>)
+ return
}
}
// -----
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant @cuda target(#hal.executable.target<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @illegal layout(#pipeline_layout) attributes {
- translation_info = #translation,
- workgroup_size = [128 : index, 1 : index, 1 : index]
- }
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xi8>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xi8>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xi8>
- // expected-error @+1 {{Expected f16, bf16 or f32 for Tensor Core (MMA.SYNC) pipeline}}
- linalg.matmul {lowering_config = #config} ins(%lhs, %rhs : memref<1024x512xi8>, memref<512x256xi8>)
- outs(%result: memref<1024x256xi8>)
- return
- }
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync workgroup_size = [128, 1, 1], {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xi8>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xi8>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xi8>
+ // expected-error @+1 {{Expected f16, bf16 or f32 for Tensor Core (MMA.SYNC) pipeline}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xi8>, memref<512x256xi8>) outs(%2 : memref<1024x256xi8>)
+ return
}
}
-
-// -----
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
index 17f3a6f..af87aff 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
@@ -1,34 +1,25 @@
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-llvmgpu-lower-executable-target)))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline, func.func(iree-llvmgpu-lower-executable-target))" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir@__transform_main | \
// RUN: FileCheck %s
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-llvmgpu-lower-executable-target)))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline, func.func(iree-llvmgpu-lower-executable-target))" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir@__transform_main | \
// RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>]>]>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
-hal.executable private @matmul_static_dispatch_0 {
- hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export public @matmul_static_dispatch_0 ordinal(0) layout(#pipeline_layout){
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @matmul_static_dispatch_0() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
-
- %50 = tensor.empty() : tensor<250x1020xf32>
- %cst = arith.constant 0.000000e+00 : f32
- %5 = linalg.fill ins(%cst : f32) outs(%50 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
+module {
+ func.func @matmul_static_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
+ %5 = tensor.empty() : tensor<250x1020xf32>
+ %cst = arith.constant 0.000000e+00 : f32
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
// CHECK: memref.assume_alignment %{{.*}}, 64 : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>
// CHECK-NEXT: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
@@ -36,7 +27,9 @@
// CHECK-NEXT: return
// workgroup_size is explicitly set to [10, 11].
- // FOREACH-TO-GPU-DAG: hal.executable.export {{.*}}{subgroup_size = 32 : index, translation_info = #translation, workgroup_size = [10 : index, 11 : index, 1 : index]}
+ // FOREACH-TO-GPU: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<None workgroup_size = [10, 11, 1] subgroup_size = 32>
+ // FOREACH-TO-GPU: func.func @matmul_static_dispatch_0()
+ // FOREACH-TO-GPU-SAME: translation_info = #translation
// FOREACH-TO-GPU-DAG: %[[C0:.*]] = arith.constant 0 : index
// FOREACH-TO-GPU-DAG: %[[C1:.*]] = arith.constant 1 : index
// FOREACH-TO-GPU-DAG: %[[C5:.*]] = arith.constant 5 : index
@@ -73,10 +66,10 @@
// FOREACH-TO-GPU: }
// FOREACH-TO-GPU: gpu.barrier
//
- %6 = linalg.matmul ins(%3, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [250, 1020], strides = [1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
- return
- }
- }
+
+ %7 = linalg.matmul ins(%3, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%6 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [250, 1020], strides = [1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
+ return
}
}
+
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir
index b126a80..b88f76a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-codegen-llvmgpu-bufferization-pipeline --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-llvmgpu-bufferization-pipeline))" --split-input-file %s | FileCheck %s
module {
func.func @bufferize_with_thread_private_memory(%arg0: index) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir
index 0dd78f5..7dbd3ab 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-codegen-linalg-to-nvvm-pipeline)))' -split-input-file %s -o - | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))' -split-input-file %s -o - | FileCheck %s
// This test checks that the lowering of nvvm includes the extraction
// and optimization of address computations.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir
index e9cc87a..8126d5c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-codegen-linalg-to-nvvm-pipeline)))" -iree-codegen-llvmgpu-use-mma-sync %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" -iree-codegen-llvmgpu-use-mma-sync %s | FileCheck %s
// Verify that a simple element-wise op gets lowered successfully all the way to
// nvvm/llvm dialect via mma.sync path.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
index 126dae1..bca6758 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-codegen-linalg-to-nvvm-pipeline)))" -iree-codegen-llvmgpu-use-wmma %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" -iree-codegen-llvmgpu-use-wmma %s | FileCheck %s
// Verify that a simple element-wise op gets lowered successfully all the way to
// nvvm/llvm dialect.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir
index 181e4ad..5789619 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir
@@ -1,32 +1,18 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @static_pack {
-hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) {
- hal.executable.export @static_pack layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @static_pack() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x16x16x32xi32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
- %3 = tensor.empty() : tensor<4x16x16x32xi32>
- %pack = tensor.pack %2 inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %3 : tensor<128x256xi32> -> tensor<4x16x16x32xi32>
- flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [4, 16, 16, 32], strides = [1, 1, 1, 1] : tensor<4x16x16x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<4x16x16x32xi32>>
- return
- }
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+module {
+ func.func @static_pack() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x16x16x32xi32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
+ %3 = tensor.empty() : tensor<4x16x16x32xi32>
+ %pack = tensor.pack %2 inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %3 : tensor<128x256xi32> -> tensor<4x16x16x32xi32>
+ flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [4, 16, 16, 32], strides = [1, 1, 1, 1] : tensor<4x16x16x32xi32> -> !flow.dispatch.tensor<writeonly:tensor<4x16x16x32xi32>>
+ return
}
}
-}
// CHECK-LABEL: func.func @static_pack
// CHECK-NOT: vector.transfer_write
// CHECK-NOT: vector.transfer_read
@@ -34,4 +20,3 @@
// CHECK: vector.transfer_read
// CHECK: vector.transpose
// CHECK: vector.transfer_write
-
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir
index 66b410e..348cfc1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax)), iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, iree-codegen-lower-executable-using-transform-dialect, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
@@ -39,11 +39,10 @@
}
}
-// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4)>
-// CHECK-LABEL: hal.executable.export public @warp_reduction_dispatch
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: workgroup_size = [256 : index, 1 : index, 1 : index]
-// CHECK-LABEL: func.func @warp_reduction_dispatch
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4)>
+// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info<None workgroup_size = [256, 1, 1] subgroup_size = 32>
+// CHECK: func.func @warp_reduction_dispatch()
+// CHECK-SAME: translation_info = #[[TRANSLATION_INFO]]
// CHECK-DAG: %[[C0I:.+]] = arith.constant 0 : i32
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i32
@@ -154,10 +153,9 @@
}
}
-// CHECK-LABEL: hal.executable.export public @warp_reduction_broadcast_dispatch
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: workgroup_size = [512 : index, 1 : index, 1 : index]
-// CHECK-LABEL: func.func @warp_reduction_broadcast_dispatch
+// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info<None workgroup_size = [512, 1, 1] subgroup_size = 32>
+// CHECK: func.func @warp_reduction_broadcast_dispatch()
+// CHECK-SAME: translation_info = #[[TRANSLATION_INFO]]
// CHECK: scf.for {{.*}} -> (vector<1xf32>) {
// CHECK: vector.transfer_read {{.*}} : memref<512x10240xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
// CHECK: vector.reduction <add>, {{.*}} : vector<4xf32> into f32
@@ -229,10 +227,9 @@
}
}
-// CHECK-LABEL: hal.executable.export public @softmax
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: workgroup_size = [1024 : index, 1 : index, 1 : index]
-// CHECK-LABEL: func.func @softmax
+// CHECK: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [1024, 1, 1] subgroup_size = 32>
+// CHECK: func.func @softmax()
+// CHECK-SAME: translation_info = #[[TRANSLATION_INFO]]
// CHECK: scf.for {{.*}} -> (vector<4xf32>) {
// CHECK: vector.transfer_read {{.*}} : memref<12x128x40960xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
// CHECK: arith.maximumf {{.*}} : vector<4xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
index 767c02b..c409d52 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
@@ -1,128 +1,80 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax)), iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @softmax {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>) {
- hal.executable.export @softmax layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @softmax() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant -3.40282347E+38 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %cst_1 = arith.constant 1.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
- %3 = tensor.empty() : tensor<12x128x40960xf32>
- %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
- return
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>
+module {
+ func.func @softmax() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant 1.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
+ %3 = tensor.empty() : tensor<12x128x40960xf32>
+ %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+ return
}
}
-}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @softmax
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [1024 : index, 1 : index, 1 : index]
-// CHECK-LABEL: func.func @softmax
-// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}}
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [1024, 1, 1] subgroup_size = 32>
+// CHECK-LABEL: func.func @softmax
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}}
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @softmax {
-hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>) {
- hal.executable.export @softmax layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @softmax() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant -3.40282347E+38 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %cst_1 = arith.constant 1.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
- %3 = tensor.empty() : tensor<12x128x40960xf32>
- %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
- return
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx940"}>
+module {
+ func.func @softmax() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant 1.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
+ %3 = tensor.empty() : tensor<12x128x40960xf32>
+ %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+ return
}
}
-}
// On CDNA, we prefer wave64 with subgroup size 64.
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @softmax
-// CHECK-SAME: subgroup_size = 64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [1024 : index, 1 : index, 1 : index]
-// CHECK-LABEL: func.func @softmax
-// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}}
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [1024, 1, 1] subgroup_size = 64>
+// CHECK: func.func @softmax
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-COUNT-20: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}}
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-
-hal.executable private @dynamic_softmax {
- hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>) {
- hal.executable.export public @dynamic_softmax ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @dynamic_softmax() {
- %c32_i64 = arith.constant 32 : i64
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = flow.dispatch.workload.ordinal %6, 0 : index
- %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7}
- %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
- %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7} -> tensor<32x?xf16>
- %11 = tensor.empty(%7) : tensor<32x?xf16>
- %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16>
- flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
- return
- }
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>
+module {
+ func.func @dynamic_softmax() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %7 = flow.dispatch.workload.ordinal %6, 0 : index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
+ %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7} -> tensor<32x?xf16>
+ %11 = tensor.empty(%7) : tensor<32x?xf16>
+ %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16>
+ flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
+ return
}
}
+
// Finer details of this lowering are captured by the spirv pipeline test. Just
// verify that warp reduction triggers.
-// CHECK-LABEL: func.func @dynamic_softmax
+// CHECK-LABEL: func.func @dynamic_softmax
// CHECK-COUNT-10: gpu.shuffle xor {{.*}} : i32
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir
index 4fdb9dd..f41da6b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, iree-codegen-lower-executable-using-transform-dialect, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
hal.executable @small_reduction {
hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir
index ba5962c..c482917 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" \
+// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \
// RUN: %s | FileCheck %s
hal.executable @group_reduction_1d {
@@ -30,12 +30,9 @@
}
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @group_reduction_1d
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
-//CHECK-LABEL: func.func @group_reduction_1d
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [32, 1, 1] subgroup_size = 32>
+// CHECK: func.func @group_reduction_1d()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
// -----
@@ -70,12 +67,8 @@
// On CDNA, we prefer wave64 with subgroup size of 64.
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @group_reduction_1d
-// CHECK-SAME: subgroup_size = 64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
-//CHECK-LABEL: func.func @group_reduction_1d
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
+// CHECK: func.func @group_reduction_1d
// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf
// -----
@@ -124,13 +117,9 @@
}
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
-//
-// CHECK-LABEL: func.func @i4_dequant_matvec()
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 32>
+// CHECK: func.func @i4_dequant_matvec()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x8xf16>
// CHECK: %[[FOR:.+]] = scf.for %{{.+}} = %c0 to %c32 step %c4 iter_args(%[[ARG:.+]] = %[[CST]]) -> (vector<1x8xf16>)
// CHECK: %[[READ0:.+]] = vector.transfer_read {{.+}} : memref<4096x32x128xi4, #hal.descriptor_type<storage_buffer>>, vector<1x8xi4>
@@ -196,12 +185,9 @@
}
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
-// CHECK-SAME: subgroup_size = 64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
-// CHECK-LABEL: func.func @i4_dequant_matvec()
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
+// CHECK: func.func @i4_dequant_matvec()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// -----
@@ -241,13 +227,9 @@
// write 8 results at the end.
// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed.
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @matvec_fp16
-// CHECK-SAME: subgroup_size = 64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
-//
-// CHECK-LABEL: func.func @matvec_fp16()
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
+// CHECK: func.func @matvec_fp16()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index
// CHECK-DAG: %[[C4096:.+]] = arith.constant 4096 : index
@@ -299,13 +281,9 @@
// Multi-row matvec with wave32.
// TODO(kuhar): We should reduce the number of `gpu.shuffles` performed.
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction>
-// CHECK-LABEL: hal.executable.export public @matvec_fp16
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
-//
-// CHECK-LABEL: func.func @matvec_fp16()
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 32>
+// CHECK: func.func @matvec_fp16()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : index
// CHECK-DAG: %[[C4096:.+]] = arith.constant 4096 : index
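
The CHECK updates in this file all follow the same pattern: the `workgroup_size` and `subgroup_size` that used to be matched on `hal.executable.export` are now matched as fields of `translation_info`, which is attached to the function. A sketch of the new expectation shape, mirroring the warp-reduction checks above (illustrative, not an additional test):

```mlir
#translation = #iree_codegen.translation_info<LLVMGPUWarpReduction workgroup_size = [64, 1, 1] subgroup_size = 64>
func.func @group_reduction_1d() attributes {translation_info = #translation} {
  // lowered body elided
  return
}
```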
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
index 3a5d17a..cdf3659 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s
// Verify that a simple element-wise op gets lowered successfully all the way to
// nvvm/llvm dialect.
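
Note the shape of the updated RUN lines in these pipeline tests: strategy selection and the transform-dialect interpreter stay anchored on the inner `builtin.module`, while the per-function lowering, where it applies, nests on `func.func`; pipelines that still need the whole variant (like the ROCDL lowering here) remain at the variant level. A generic sketch of the nesting, wrapped for readability (the RUN lines above are the authoritative form):

```mlir
// RUN: iree-opt --split-input-file \
// RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant( \
// RUN:       builtin.module(iree-llvmgpu-select-lowering-strategy, \
// RUN:                      iree-codegen-lower-executable-using-transform-dialect, \
// RUN:                      func.func(iree-llvmgpu-lower-executable-target)))))" \
// RUN:   %s | FileCheck %s
```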
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
index 136285c..e086c1f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
@@ -1,8 +1,8 @@
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=1 --iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy |\
// RUN: FileCheck %s --check-prefixes=CHECK,DEFAULT
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=1 --iree-codegen-llvmgpu-enable-transform-dialect-batch-matmul-strategy \
// RUN: -td-matmul-strategy-blk-sizes=128,64,32,2 \
// RUN: -td-matmul-strategy-reduc-size=8 \
@@ -18,35 +18,25 @@
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
-hal.executable private @batch_matmul_dispatch_0 {
- hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export public @batch_matmul_dispatch_0_generic_128x80x320x32_f32 ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @batch_matmul_dispatch_0_generic_128x80x320x32_f32() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x32x320xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x80x320xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32xf32>> -> tensor<128x80x32xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x32x320xf32>> -> tensor<128x32x320xf32>
- %5 = tensor.empty() : tensor<128x80x320xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x80x320xf32>) -> tensor<128x80x320xf32>
- %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x80x32xf32>, tensor<128x32x320xf32>) outs(%6 : tensor<128x80x320xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %8 = arith.mulf %in, %in_0 : f32
- %9 = arith.addf %out, %8 : f32
- linalg.yield %9 : f32
- } -> tensor<128x80x320xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x80x320xf32>>
- return
- }
- }
+module {
+ func.func @batch_matmul_dispatch_0_generic_128x80x320x32_f32() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x32x320xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x80x320xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32xf32>> -> tensor<128x80x32xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x32x320xf32>> -> tensor<128x32x320xf32>
+ %5 = tensor.empty() : tensor<128x80x320xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x80x320xf32>) -> tensor<128x80x320xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x80x32xf32>, tensor<128x32x320xf32>) outs(%6 : tensor<128x80x320xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.mulf %in, %in_0 : f32
+ %9 = arith.addf %out, %8 : f32
+ linalg.yield %9 : f32
+ } -> tensor<128x80x320xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x80x320xf32>>
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
index 3345f41..65fadce 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
@@ -1,30 +1,21 @@
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" --iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy | FileCheck %s
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" --iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy | FileCheck %s
-hal.executable @nchw_convolution {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @nchw_convolution ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @nchw_convolution() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x128x258x258xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128x3x3xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 128, 258, 258], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x128x258x258xf32>> -> tensor<8x128x258x258xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [256, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128x3x3xf32>> -> tensor<256x128x3x3xf32>
+ %5 = tensor.empty() : tensor<8x256x256x256xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
+ %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x128x258x258xf32>, tensor<256x128x3x3xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
+ return
}
- builtin.module {
- func.func @nchw_convolution() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x128x258x258xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x128x3x3xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 128, 258, 258], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x128x258x258xf32>> -> tensor<8x128x258x258xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [256, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128x3x3xf32>> -> tensor<256x128x3x3xf32>
- %5 = tensor.empty() : tensor<8x256x256x256xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
- %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
- ins(%3, %4 : tensor<8x128x258x258xf32>, tensor<256x128x3x3xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @nchw_convolution
@@ -71,31 +62,22 @@
// -----
-hal.executable @nhwc_convolution {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @nhwc_convolution ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @nhwc_convolution() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x258x258x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x128x256xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x258x258x128xf32>> -> tensor<8x258x258x128xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 128, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x128x256xf32>> -> tensor<3x3x128x256xf32>
+ %5 = tensor.empty() : tensor<8x256x256x256xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x128xf32>, tensor<3x3x128x256xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
+ return
}
- builtin.module {
- func.func @nhwc_convolution() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x258x258x128xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x128x256xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x258x258x128xf32>> -> tensor<8x258x258x128xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 128, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x128x256xf32>> -> tensor<3x3x128x256xf32>
- %5 = tensor.empty() : tensor<8x256x256x256xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
- %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
- ins(%3, %4 : tensor<8x258x258x128xf32>, tensor<3x3x128x256xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x256x256x256xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @nhwc_convolution
@@ -114,37 +96,27 @@
// CHECK: transform.structured.tile_using_forall %{{.*}} num_threads [1, 2, 2](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1]
-
// -----
-hal.executable @unaligned_convolution {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @unaligned_convolution ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @unaligned_convolution() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x258x258x132xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x132x264xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x256x256x264xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 132], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x258x258x132xf32>> -> tensor<8x258x258x132xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 132, 264], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x132x264xf32>> -> tensor<3x3x132x264xf32>
+ %5 = tensor.empty() : tensor<8x256x256x264xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x132xf32>, tensor<3x3x132x264xf32>) outs(%6 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 264], strides = [1, 1, 1, 1] : tensor<8x256x256x264xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x256x256x264xf32>>
+ return
}
- builtin.module {
- func.func @unaligned_convolution() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x258x258x132xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x132x264xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x256x256x264xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 132], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x258x258x132xf32>> -> tensor<8x258x258x132xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 132, 264], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x132x264xf32>> -> tensor<3x3x132x264xf32>
- %5 = tensor.empty() : tensor<8x256x256x264xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32>
- %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
- ins(%3, %4 : tensor<8x258x258x132xf32>, tensor<3x3x132x264xf32>) outs(%6 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 264], strides = [1, 1, 1, 1] : tensor<8x256x256x264xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x256x256x264xf32>>
- return
- }
- }
-}
}
-// CHECK: #iree_codegen.translation_info<LLVMGPUVectorize>
+// CHECK: #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [2, 4, 4]>
// CHECK-LABEL: func @unaligned_convolution
// Currently padding on the img2col op is not supported, so bail out for the unaligned case.
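
One detail visible in the updated checks: `subgroup_size` is an optional field of `translation_info`. The unaligned convolution above only pins the workgroup size, while the warp-reduction and small-matmul tests also pin the subgroup size. Both forms, as they appear in these tests:

```mlir
// Workgroup size only (unaligned convolution check above):
//   #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [2, 4, 4]>
// Workgroup and subgroup size (small matmul check in the next file):
//   #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1] subgroup_size = 32>
```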
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
index 8b3d376..80bf0e1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
@@ -1,9 +1,9 @@
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul | FileCheck %s
// Check that setting the command line options affects the transform
// strategy as expected.
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: -td-matmul-strategy-blk-sizes=256,64,1 \
// RUN: -td-matmul-strategy-reduc-size=8 \
// RUN: -td-matmul-strategy-num-threads=32,4,1 \
@@ -14,7 +14,7 @@
// RUN: | FileCheck --check-prefix=WITH_OPTIONS %s
// Check that various more exotic strategies apply properly end-to-end, without otherwise checking their content.
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul \
// RUN: -td-matmul-strategy-blk-sizes=16,16,1 \
// RUN: -td-matmul-strategy-reduc-size=16 \
@@ -26,7 +26,7 @@
// RUN: | FileCheck --check-prefix=WITH_OPTIONS_2 %s
// Check that various more exotic strategies apply properly end-to-end, without otherwise checking their content.
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul \
// RUN: -td-matmul-strategy-blk-sizes=128,64,1 \
// RUN: -td-matmul-strategy-reduc-size=16 \
@@ -37,33 +37,25 @@
// RUN: -td-matmul-strategy-pipeline-depth=3 \
// RUN: | FileCheck --check-prefix=WITH_OPTIONS_3 %s
-// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-small-matmul | FileCheck --check-prefix=SMALL %s
-hal.executable @matmul_1 {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @matmul_1 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @matmul_1() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>> -> tensor<2052x2556xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>> -> tensor<2556x2052xf32>
+ %5 = tensor.empty() : tensor<2052x2052xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
+ return
}
- builtin.module {
- func.func @matmul_1() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>> -> tensor<2052x2556xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>> -> tensor<2556x2052xf32>
- %5 = tensor.empty() : tensor<2052x2052xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @matmul_1
@@ -206,30 +198,22 @@
// -----
-hal.executable @matmul_2 {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @matmul_2 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @matmul_2() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2051x2555xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2555x2050xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2051x2050xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2051, 2555], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2051x2555xf32>> -> tensor<2051x2555xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2555, 2051], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2555x2050xf32>> -> tensor<2555x2050xf32>
+ %5 = tensor.empty() : tensor<2051x2050xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2051x2555xf32>, tensor<2555x2050xf32>) outs(%6 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2051, 2050], strides = [1, 1] : tensor<2051x2050xf32> -> !flow.dispatch.tensor<writeonly:tensor<2051x2050xf32>>
+ return
}
- builtin.module {
- func.func @matmul_2() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2051x2555xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2555x2050xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2051x2050xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2051, 2555], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2051x2555xf32>> -> tensor<2051x2555xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2555, 2051], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2555x2050xf32>> -> tensor<2555x2050xf32>
- %5 = tensor.empty() : tensor<2051x2050xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<2051x2555xf32>, tensor<2555x2050xf32>) outs(%6 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2051, 2050], strides = [1, 1] : tensor<2051x2050xf32> -> !flow.dispatch.tensor<writeonly:tensor<2051x2050xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @matmul_2
@@ -260,30 +244,22 @@
// -----
-hal.executable @matmul_3 {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @matmul_3 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @matmul_3() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2556xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2556xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2556xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2556xf32>> -> tensor<2048x2556xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2556xf32>> -> tensor<2556x2556xf32>
+ %5 = tensor.empty() : tensor<2048x2556xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2048x2556xf32>, tensor<2556x2556xf32>) outs(%6 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : tensor<2048x2556xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2556xf32>>
+ return
}
- builtin.module {
- func.func @matmul_3() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2556xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2556xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2556xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2556xf32>> -> tensor<2048x2556xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2556xf32>> -> tensor<2556x2556xf32>
- %5 = tensor.empty() : tensor<2048x2556xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<2048x2556xf32>, tensor<2556x2556xf32>) outs(%6 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : tensor<2048x2556xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2556xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @matmul_3
@@ -295,30 +271,23 @@
// WITH_OPTIONS_3-LABEL: func @matmul_3
// -----
-hal.executable @matmul_4_partially_unaligned {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @matmul_4_partially_unaligned ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @matmul_4_partially_unaligned() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2044xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2044xf32>> -> tensor<2048x2044xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>> -> tensor<2044x1024xf32>
+ %5 = tensor.empty() : tensor<2048x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2048x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : tensor<2048x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x1024xf32>>
+ return
}
- builtin.module {
- func.func @matmul_4_partially_unaligned() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2044xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2044xf32>> -> tensor<2048x2044xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>> -> tensor<2044x1024xf32>
- %5 = tensor.empty() : tensor<2048x1024xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<2048x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : tensor<2048x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x1024xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @matmul_4_partially_unaligned
@@ -366,30 +335,23 @@
// WITH_OPTIONS_3-LABEL: func @matmul_4_partially_unaligned
// -----
-hal.executable @aligned_matmul {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @aligned_matmul ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @aligned_matmul() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>> -> tensor<2048x2048xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>> -> tensor<2048x2048xf32>
+ %5 = tensor.empty() : tensor<2048x2048xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xf32>, tensor<2048x2048xf32>) outs(%6 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xf32>>
+ return
}
- builtin.module {
- func.func @aligned_matmul() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x2048xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>> -> tensor<2048x2048xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x2048xf32>> -> tensor<2048x2048xf32>
- %5 = tensor.empty() : tensor<2048x2048xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xf32>, tensor<2048x2048xf32>) outs(%6 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x2048xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @aligned_matmul
@@ -437,33 +399,25 @@
// -----
-hal.executable @matmul_5_small {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @matmul_5_small ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @matmul_5_small() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x2044xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 2044], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2044xf32>> -> tensor<2x2044xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2044, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>> -> tensor<2044x1024xf32>
+ %5 = tensor.empty() : tensor<2x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
+ return
}
- builtin.module {
- func.func @matmul_5_small() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x2044xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 2044], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2044xf32>> -> tensor<2x2044xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2044, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2044x1024xf32>> -> tensor<2044x1024xf32>
- %5 = tensor.empty() : tensor<2x1024xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<2x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
- return
- }
- }
-}
}
-// CHECK: iree_codegen.translation_info<LLVMGPUVectorize>
+// CHECK: iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [64, 1, 1] subgroup_size = 32>
// CHECK-LABEL: func @matmul_5_small
// This matmul is considered "too small"/"degenerate" for a tensor core strategy,
@@ -480,33 +434,25 @@
// -----
-hal.executable @f16_matmul {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @f16_matmul ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @f16_matmul() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2052x2556xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2052x2052xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2052x2556xf16>> -> tensor<2052x2556xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xf16>> -> tensor<2556x2052xf16>
+ %5 = tensor.empty() : tensor<2052x2052xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf16>, tensor<2556x2052xf16>) outs(%6 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf16> -> !flow.dispatch.tensor<writeonly:tensor<2052x2052xf16>>
+ return
}
- builtin.module {
- func.func @f16_matmul() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2052x2556xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2052x2052xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2052x2556xf16>> -> tensor<2052x2556xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xf16>> -> tensor<2556x2052xf16>
- %5 = tensor.empty() : tensor<2052x2052xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16>
- %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf16>, tensor<2556x2052xf16>) outs(%6 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf16> -> !flow.dispatch.tensor<writeonly:tensor<2052x2052xf16>>
- return
- }
- }
-}
}
-// CHECK: iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+// CHECK: iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
// CHECK-LABEL: func @f16_matmul
// CHECK-NOT: transform.sequence
// CHECK-NOT: transform.named_sequence
@@ -515,33 +461,24 @@
// WITH_OPTIONS_3-LABEL: func @f16_matmul
-
// -----
-hal.executable @int8_matmul {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @int8_matmul ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @int8_matmul() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c0_i8 = arith.constant 0 : i8
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4x2556xi8>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xi8>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x2052xi8>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x2556xi8>> -> tensor<4x2556xi8>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xi8>> -> tensor<2556x2052xi8>
+ %5 = tensor.empty() : tensor<4x2052xi8>
+ %6 = linalg.fill ins(%c0_i8 : i8) outs(%5 : tensor<4x2052xi8>) -> tensor<4x2052xi8>
+ %7 = linalg.matmul ins(%3, %4 : tensor<4x2556xi8>, tensor<2556x2052xi8>) outs(%6 : tensor<4x2052xi8>) -> tensor<4x2052xi8>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 2052], strides = [1, 1] : tensor<4x2052xi8> -> !flow.dispatch.tensor<writeonly:tensor<4x2052xi8>>
+ return
}
- builtin.module {
- func.func @int8_matmul() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0 : i8
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4x2556xi8>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xi8>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x2052xi8>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x2556xi8>> -> tensor<4x2556xi8>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xi8>> -> tensor<2556x2052xi8>
- %5 = tensor.empty() : tensor<4x2052xi8>
- %6 = linalg.fill ins(%cst : i8) outs(%5 : tensor<4x2052xi8>) -> tensor<4x2052xi8>
- %7 = linalg.matmul ins(%3, %4 : tensor<4x2556xi8>, tensor<2556x2052xi8>) outs(%6 : tensor<4x2052xi8>) -> tensor<4x2052xi8>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 2052], strides = [1, 1] : tensor<4x2052xi8> -> !flow.dispatch.tensor<writeonly:tensor<4x2052xi8>>
- return
- }
- }
-}
}
// SMALL-LABEL: func @int8_matmul
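
The SIMT matmul check above also shows how a pipeline-specific configuration dictionary prints in the updated attribute: the dictionary was already part of `translation_info` before this change; what is new is that `workgroup_size` and `subgroup_size` now appear between the pipeline name and that dictionary. Taken from the f16 matmul check above:

```mlir
// #iree_codegen.translation_info<LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
```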
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
index fb56ef9..7b30b67 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir
@@ -1,12 +1,12 @@
// RUN: iree-opt %s --split-input-file \
-// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy \
// RUN: | FileCheck %s
// Check that setting the command line options affects the transform
// strategy as expected.
// RUN: iree-opt %s --split-input-file \
-// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" \
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-pad-strategy \
// RUN: --td-pad-strategy-blk-sizes=16,32,1 \
// RUN: --td-pad-strategy-num-threads=8,4,1 \
@@ -14,33 +14,23 @@
// RUN: --td-pad-strategy-use-async-copies=false \
// RUN: | FileCheck --check-prefix=WITH_OPTIONS %s
-hal.executable @pad {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @pad ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @pad() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %c56 = arith.constant 56 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %padded = tensor.pad %2 low[%c0, 0] high[5, %c56] {
+ ^bb0(%arg0: index, %arg1: index):
+ tensor.yield %cst_0 : f32
+ } : tensor<123x456xf32> to tensor<128x512xf32>
+ flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ return
}
- builtin.module {
- func.func @pad() {
- %c0 = arith.constant 0 : index
- %c56 = arith.constant 56 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
-
- %pad = arith.constant 0.0 : f32
- %padded = tensor.pad %3 low[%c0, 0] high[5, %c56] {
- ^bb0(%arg1: index, %arg2: index):
- tensor.yield %pad : f32
- } : tensor<123x456xf32> to tensor<128x512xf32>
-
- flow.dispatch.tensor.store %padded, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @pad
@@ -103,32 +93,22 @@
// -----
-hal.executable @pad_low {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @pad_low ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @pad_low() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %padded = tensor.pad %2 low[5, 0] high[0, 56] {
+ ^bb0(%arg0: index, %arg1: index):
+ tensor.yield %cst_0 : f32
+ } : tensor<123x456xf32> to tensor<128x512xf32>
+ flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ return
}
- builtin.module {
- func.func @pad_low() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
-
- %pad = arith.constant 0.0 : f32
- %padded = tensor.pad %3 low[5, 0] high[0, 56] {
- ^bb0(%arg1: index, %arg2: index):
- tensor.yield %pad : f32
- } : tensor<123x456xf32> to tensor<128x512xf32>
-
- flow.dispatch.tensor.store %padded, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- return
- }
- }
-}
}
// The strategy doesn't apply for low padding.
@@ -139,33 +119,23 @@
// -----
-hal.executable @pad_local {
-hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
- hal.executable.export public @pad_local ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
+module {
+ func.func @pad_local() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
+ %padded = tensor.pad %2 low[0, 0] high[5, 56] {
+ ^bb0(%arg0: index, %arg1: index):
+ %3 = arith.index_cast %arg0 : index to i64
+ %4 = arith.uitofp %3 : i64 to f32
+ tensor.yield %4 : f32
+ } : tensor<123x456xf32> to tensor<128x512xf32>
+ flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+ return
}
- builtin.module {
- func.func @pad_local() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
-
- %padded = tensor.pad %3 low[0, 0] high[5, 56] {
- ^bb0(%arg1: index, %arg2: index):
- %5 = arith.index_cast %arg1 : index to i64
- %pad = arith.uitofp %5 : i64 to f32
- tensor.yield %pad : f32
- } : tensor<123x456xf32> to tensor<128x512xf32>
-
- flow.dispatch.tensor.store %padded, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
- return
- }
- }
-}
}
// The strategy doesn't apply for local pad values.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
index 4468fd0..82f73e6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir
@@ -1,40 +1,35 @@
// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule | FileCheck %s
-hal.executable private @pad_matmul_static_dispatch_0 {
- builtin.module {
- func.func @pad_matmul_static_dispatch_0() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
+func.func @pad_matmul_static_dispatch_0() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
- %50 = tensor.empty() : tensor<250x1020xf32>
- %cst = arith.constant 0.000000e+00 : f32
- %5 = linalg.fill ins(%cst : f32) outs(%50 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
+ %50 = tensor.empty() : tensor<250x1020xf32>
+ %cst = arith.constant 0.000000e+00 : f32
+ %5 = linalg.fill ins(%cst : f32) outs(%50 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
- // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
- // CHECK: memref.alloc() {alignment = 64 : i64} : memref<250x500xf32, #gpu.address_space<workgroup>>
- // CHECK: gpu.barrier
- // CHECK: linalg.generic
- // CHECK: gpu.barrier
- // CHECK-NEXT: linalg.matmul{{.*}}ins(%{{.*}} : memref<250x500xf32, #gpu.address_space<workgroup>>, memref<500x1020xf32, #hal.descriptor_type<storage_buffer>>) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
- %p = bufferization.alloc_tensor() copy(%3) : tensor<250x500xf32>
- %6 = linalg.matmul ins(%p, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
+ // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
+ // CHECK: memref.alloc() {alignment = 64 : i64} : memref<250x500xf32, #gpu.address_space<workgroup>>
+ // CHECK: gpu.barrier
+ // CHECK: linalg.generic
+ // CHECK: gpu.barrier
+ // CHECK-NEXT: linalg.matmul{{.*}}ins(%{{.*}} : memref<250x500xf32, #gpu.address_space<workgroup>>, memref<500x1020xf32, #hal.descriptor_type<storage_buffer>>) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type<storage_buffer>>)
+ %p = bufferization.alloc_tensor() copy(%3) : tensor<250x500xf32>
+ %6 = linalg.matmul ins(%p, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
- flow.dispatch.tensor.store %6, %2, offsets=[0, 0], sizes=[250, 1020], strides=[1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
- return
- }
- }
-
- builtin.module attributes { transform.with_named_sequence } {
- transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.consumed}) {
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op: (!transform.any_op) -> !transform.any_op
- %func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.yield
- } // @__transform_main
- } // module
-
+ flow.dispatch.tensor.store %6, %2, offsets=[0, 0], sizes=[250, 1020], strides=[1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
+ return
}
+
+builtin.module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.consumed}) {
+ %func = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_empty_tensors %func : (!transform.any_op) -> ()
+ %_ = transform.iree.bufferize { target_gpu } %func: (!transform.any_op) -> !transform.any_op
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
index 3a47a94..c77fe84 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir
@@ -1,14 +1,13 @@
module attributes { transform.with_named_sequence } {
transform.named_sequence @__transform_main(
- %variant_op: !transform.any_op {transform.consumed}) {
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %variant_op: !transform.any_op) {
+ %tensor_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_empty_tensors %tensor_func : (!transform.any_op) -> ()
+ %memref_func = transform.iree.bufferize %tensor_func : (!transform.any_op) -> !transform.any_op
// Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
%none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+ transform.annotate %memref_func "translation_info" = %none : !transform.any_op, !transform.any_param
transform.yield
}
} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
index b15fe98..88fbb50 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir
@@ -1,6 +1,6 @@
module attributes { transform.with_named_sequence } {
transform.named_sequence @__transform_main(
- %variant_op: !transform.any_op {transform.consumed}) {
+ %variant_op: !transform.any_op) {
%0 = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%forall, %tiled_fill = transform.structured.tile_using_forall %0 num_threads [5, 1]
( mapping = [#gpu.thread<y>, #gpu.thread<x>] )
@@ -25,9 +25,8 @@
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %func : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.iree.eliminate_empty_tensors %func : (!transform.any_op) -> ()
+ %memref_func = transform.iree.bufferize %func : (!transform.any_op) -> (!transform.any_op)
transform.iree.map_nested_forall_to_gpu_threads %memref_func
workgroup_dims = [10, 11, 1] : (!transform.any_op) -> ()
@@ -40,11 +39,6 @@
} : !transform.any_op
transform.iree.apply_licm %memref_func : !transform.any_op
transform.apply_cse to %memref_func : !transform.any_op
-
- // Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
transform.yield
}
} // module
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
index b1c2065..90978d2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir
@@ -1,60 +1,55 @@
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(iree-transform-dialect-interpreter)" \
// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_vector_warp_execute_on_lane_0_spec.mlir \
// RUN: --allow-unregistered-dialect | \
// RUN: FileCheck %s --check-prefix=WARP-EXECUTE
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter))" \
+// RUN: iree-opt %s --pass-pipeline="builtin.module(iree-transform-dialect-interpreter)" \
// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_vector_distribution_spec.mlir \
// RUN: --allow-unregistered-dialect | \
// RUN: FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>]>]>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
+#translation_info = #iree_codegen.translation_info<None workgroup_size = [64, 1, 1] subgroup_size = 32>
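+// The workgroup and subgroup sizes are carried by the translation_info attribute on the function below.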
+module {
+ func.func @reduce_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation_info} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32>
+ memref.assume_alignment %0, 64 : memref<128xf32>
+ %1 = gpu.thread_id x
+ %2 = arith.cmpi ult, %1, %c1 : index
-hal.executable private @reduce_dispatch_0 {
- hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
- hal.executable.export public @reduce_dispatch_0 ordinal(0) layout(#pipeline_layout) attributes { workgroup_size = [64: index, 1: index, 1: index], subgroup_size = 32 : index }
- builtin.module {
- func.func @reduce_dispatch_0() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32>
- memref.assume_alignment %0, 64 : memref<128xf32>
- %1 = gpu.thread_id x
- %2 = arith.cmpi ult, %1, %c1 : index
+ // WARP-EXECUTE-DAG: %[[C0:.*]] = arith.constant 0 : index
+ // WARP-EXECUTE-DAG: %[[C32:.*]] = arith.constant 32 : index
+ // WARP-EXECUTE: %[[TIDX:.*]] = gpu.thread_id x
+ // WARP-EXECUTE: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
+ // Single-warp guard filters out threads 32-63.
+ // WARP-EXECUTE: scf.if %[[COND32]] {
+ // WARP-EXECUTE: vector.warp_execute_on_lane_0(%[[TIDX]])[32] {
+ // WARP-EXECUTE: %[[V:.*]] = "some_def"() : () -> vector<128xf32>
+ // WARP-EXECUTE: vector.transfer_write %[[V]], %{{.*}} {in_bounds = [true]} : vector<128xf32>, memref<128xf32>
- // WARP-EXECUTE-DAG: %[[C0:.*]] = arith.constant 0 : index
- // WARP-EXECUTE-DAG: %[[C32:.*]] = arith.constant 32 : index
- // WARP-EXECUTE: %[[TIDX:.*]] = gpu.thread_id x
- // WARP-EXECUTE: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
- // Single-warp guard filters out threads 32-63.
- // WARP-EXECUTE: scf.if %[[COND32]] {
- // WARP-EXECUTE: vector.warp_execute_on_lane_0(%[[TIDX]])[32] {
- // WARP-EXECUTE: %[[V:.*]] = "some_def"() : () -> vector<128xf32>
- // WARP-EXECUTE: vector.transfer_write %[[V]], %{{.*}} {in_bounds = [true]} : vector<128xf32>, memref<128xf32>
-
- // CHECK-DAG: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 4)>
- // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
- // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
- // CHECK: %[[TIDX:.*]] = gpu.thread_id x
- // CHECK: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
- // Single-warp guard filters out threads 32-63.
- // CHECK: scf.if %[[COND32]] {
- // CHECK: %[[COND1:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
- // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
- // Single-thread guard runs on thread 0 only.
- // CHECK: scf.if %[[COND1]] {
- // CHECK: %[[V:.*]] = "some_def"() : () -> vector<128xf32>
- // CHECK: vector.transfer_write %[[V]], %{{.*}} : vector<128xf32>, memref<128xf32, #gpu.address_space<workgroup>>
- // CHECK: %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[TIDX]]]
- // CHECK: %[[LOADED:.*]] = vector.transfer_read %{{.*}}[%[[IDX]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, #gpu.address_space<workgroup>>, vector<4xf32>
- // CHECK: vector.transfer_write %[[LOADED]], %{{.*}} {in_bounds = [true]} : vector<4xf32>, memref<128xf32>
- scf.if %2 {
- %v = "some_def"() : () -> (vector<128xf32>)
- vector.transfer_write %v, %0[%c0] : vector<128xf32>, memref<128xf32>
- }
- return
- }
+ // CHECK-DAG: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 4)>
+ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index
+ // CHECK: %[[TIDX:.*]] = gpu.thread_id x
+ // CHECK: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index
+ // Single-warp guard filters out threads 32-63.
+ // CHECK: scf.if %[[COND32]] {
+ // CHECK: %[[COND1:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
+ // Single-thread guard runs on thread 0 only.
+ // CHECK: scf.if %[[COND1]] {
+ // CHECK: %[[V:.*]] = "some_def"() : () -> vector<128xf32>
+ // CHECK: vector.transfer_write %[[V]], %{{.*}} : vector<128xf32>, memref<128xf32, #gpu.address_space<workgroup>>
+ // CHECK: %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[TIDX]]]
+ // CHECK: %[[LOADED:.*]] = vector.transfer_read %{{.*}}[%[[IDX]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, #gpu.address_space<workgroup>>, vector<4xf32>
+ // CHECK: vector.transfer_write %[[LOADED]], %{{.*}} {in_bounds = [true]} : vector<4xf32>, memref<128xf32>
+ scf.if %2 {
+ %v = "some_def"() : () -> (vector<128xf32>)
+ vector.transfer_write %v, %0[%c0] : vector<128xf32>, memref<128xf32>
}
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
index bc92986..9017dd2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir
@@ -1,69 +1,52 @@
-// RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(iree-transform-dialect-interpreter,transform-dialect-drop-schedule))" | FileCheck %s
+// RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-lower-executable-using-transform-dialect)" | FileCheck %s
-// CHECK: #[[$DIV32:.*]] = affine_map<()[s0] -> (s0 floordiv 32)>
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>
-#map = affine_map<()[s0] -> (s0 * 8)>
-#map1 = affine_map<(d0) -> (d0)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<TransformDialectCodegen>
-hal.executable private @distribute {
- hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
-// CHECK: hal.executable.export {{.*}} attributes
-// CHECK-SAME: subgroup_size = 32
-// CHECK-SAME: workgroup_size = [256 : index, 1 : index, 1 : index]
- hal.executable.export public @distribute ordinal(0) layout(#pipeline_layout) attributes {translation_info = #translation} {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %c1 = arith.constant 1 : index
- hal.return %arg1, %c1, %c1 : index, index, index
- }
- builtin.module {
+module {
+ func.func @distribute() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb, translation_info = #translation} {
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf16>
+ %c250 = arith.constant 250 : index
+ %c8 = arith.constant 8 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xf16>
+ memref.assume_alignment %0, 64 : memref<2xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xf16> to memref<1xf16, strided<[1], offset: ?>>
+ scf.forall (%arg0) in (%c250) {
+ vector.transfer_write %cst, %subview[%arg0] {in_bounds = [true]} : vector<1xf16>, memref<1xf16, strided<[1], offset: ?>>
+ } {mapping = [#gpu.thread<x>]}
+ scf.forall (%arg0) in (%c8) {
+ vector.transfer_write %cst, %subview[%arg0] {in_bounds = [true]} : vector<1xf16>, memref<1xf16, strided<[1], offset: ?>>
+ } {mapping = [#gpu.warp<x>]}
+ return
+ }
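+  // Transform script embedded next to the payload function; it is run by the
+  // iree-codegen-lower-executable-using-transform-dialect pass from the RUN line above.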
+ builtin.module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+ %17 = transform.structured.match ops{["func.func"]} in %variant_op
+ : (!transform.any_op) -> !transform.any_op
+ transform.iree.map_nested_forall_to_gpu_threads %17
+ workgroup_dims = [256, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
-// CHECK-LABEL: func.func @distribute
- func.func @distribute() {
- %cst_0 = arith.constant dense<0.000000e+00> : vector<1xf16>
- %c250 = arith.constant 250 : index
- %c8 = arith.constant 8 : index
- %c0 = arith.constant 0 : index
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xf16>
- memref.assume_alignment %1, 64 : memref<2xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %subview = memref.subview %1[%workgroup_id_x] [1] [1] : memref<2xf16> to memref<1xf16, strided<[1], offset: ?>>
+ // Late canonicalizations to cleanup and pass the checks.
+ // Needs to occur on the whole variant to perform cse on the workgroup_count region
+ %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_op {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.iree.apply_licm %func_op : !transform.any_op
+ transform.apply_cse to %func_op : !transform.any_op
+ transform.yield
+ } // @__transform_main
+ } // module
+}
+// CHECK-DAG: #[[DIV32:.*]] = affine_map<()[s0] -> (s0 floordiv 32)>
+// CHECK-DAG: #[[TRANSLATION_INFO:.*]] = #iree_codegen.translation_info<None workgroup_size = [256, 1, 1] subgroup_size = 32>
+// CHECK: func.func @distribute()
+// CHECK-SAME: translation_info = #[[TRANSLATION_INFO]]
// CHECK: %[[TX:.+]] = gpu.thread_id x
// CHECK: %[[COND:.*]] = arith.cmpi ult
// CHECK: scf.if %[[COND]] {
// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[TX]]] {in_bounds = [true]} : vector<1xf16>, memref<1xf16, strided<[1], offset: ?>>
- scf.forall (%arg0) in (%c250) {
- vector.transfer_write %cst_0, %subview[%arg0]
- {in_bounds = [true]} : vector<1xf16>, memref<1xf16, strided<[1], offset: ?>>
- } {mapping = [#gpu.thread<x>]}
-
-// CHECK: %[[WX:.+]] = affine.apply #[[$DIV32]]()[%[[TX]]]
+// CHECK: %[[WX:.+]] = affine.apply #[[DIV32]]()[%[[TX]]]
// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[WX]]] {in_bounds = [true]} : vector<1xf16>, memref<1xf16, strided<[1], offset: ?>>
- scf.forall (%arg0) in (%c8) {
- vector.transfer_write %cst_0, %subview[%arg0]
- {in_bounds = [true]} : vector<1xf16>, memref<1xf16, strided<[1], offset: ?>>
- } {mapping = [#gpu.warp<x>]}
- return
- }
- builtin.module attributes { transform.with_named_sequence } {
- transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
- %17 = transform.structured.match ops{["func.func"]} in %variant_op
- : (!transform.any_op) -> !transform.any_op
- transform.iree.map_nested_forall_to_gpu_threads %17
- workgroup_dims = [256, 1, 1] subgroup_size = 32 : (!transform.any_op) -> ()
-
- // Late canonicalizations to cleanup and pass the checks.
- // Needs to occur on the whole variant to perform cse on the workgroup_count region
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op {
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.apply_cse to %func_op : !transform.any_op
- transform.yield
- } // @__transform_main
- } // module
- }
- }
-}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
index a856da6..452edb3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target, fold-memref-alias-ops, canonicalize, cse)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target, fold-memref-alias-ops, canonicalize, cse)))))" %s | FileCheck %s
#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir
index 809022f..77e21f4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir
@@ -1,215 +1,180 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s
-hal.executable @argmax_1d_f16i64 {
-hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100", ukernels = "argmax"}>) {
- hal.executable.export public @argmax_1d_f16i64 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @argmax_1d_f16i64() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0xFC00 : f16
- %c0_i64 = arith.constant 0 : i64
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
- %8 = flow.dispatch.workload.ordinal %6, 0 : index
- %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
- %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
- %11 = tensor.empty() : tensor<i64>
- %12 = tensor.empty() : tensor<f16>
- %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
- %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
- %15:2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
- ^bb0(%in: f16, %out: f16, %out_0: i64):
- %16 = linalg.index 0 : index
- %17 = arith.index_cast %16 : index to i64
- %18 = arith.maximumf %in, %out : f16
- %19 = arith.cmpf ogt, %in, %out : f16
- %20 = arith.select %19, %17, %out_0 : i64
- linalg.yield %18, %20 : f16, i64
- } -> (tensor<f16>, tensor<i64>)
- flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
- return
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100", ukernels = "argmax"}>
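+// ukernels = "argmax" opts this target into the argmax ukernel lowering; the checks below expect an iree_codegen.ukernel.generic call.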
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module {
+ func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0xFC00 : f16
+ %c0_i64 = arith.constant 0 : i64
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+ %8 = flow.dispatch.workload.ordinal %6, 0 : index
+ %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
+ %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
+ %11 = tensor.empty() : tensor<i64>
+ %12 = tensor.empty() : tensor<f16>
+ %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
+ %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
+ %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
+ ^bb0(%in: f16, %out: f16, %out_0: i64):
+ %16 = linalg.index 0 : index
+ %17 = arith.index_cast %16 : index to i64
+ %18 = arith.maximumf %in, %out : f16
+ %19 = arith.cmpf ogt, %in, %out : f16
+ %20 = arith.select %19, %17, %out_0 : i64
+ linalg.yield %18, %20 : f16, i64
+ } -> (tensor<f16>, tensor<i64>)
+ flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+ return
}
}
-}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDefault>
-// CHECK-LABEL: hal.executable.export public @argmax_1d_f16i64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
-//CHECK-LABEL: func.func @argmax_1d_f16i64
-// CHECK: iree_codegen.ukernel.generic "__iree_uk_rocm_argmax_F16I64"
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDefault workgroup_size = [32, 1, 1]>
+// CHECK: func.func @argmax_1d_f16i64()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK: iree_codegen.ukernel.generic "__iree_uk_rocm_argmax_F16I64"
// -----
-
-hal.executable @argmax_2d_f32i64 {
-hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100", ukernels = "argmax"}>) {
- hal.executable.export public @argmax_2d_f32i64 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @argmax_2d_f32i64() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0xFF800000 : f32
- %c0_i64 = arith.constant 0 : i64
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16xi64>>
- %8 = flow.dispatch.workload.ordinal %6, 0 : index
- %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x?xf32>>{%8}
- %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [16, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x?xf32>>{%8} -> tensor<16x?xf32>
- %11 = tensor.empty() : tensor<16xi64>
- %12 = tensor.empty() : tensor<16xf32>
- %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<16xi64>) -> tensor<16xi64>
- %14 = linalg.fill ins(%cst : f32) outs(%12 : tensor<16xf32>) -> tensor<16xf32>
- %15:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<16x?xf32>) outs(%14, %13 : tensor<16xf32>, tensor<16xi64>) {
- ^bb0(%in: f32, %out: f32, %out_0: i64):
- %16 = linalg.index 1 : index
- %17 = arith.index_cast %16 : index to i64
- %18 = arith.maximumf %in, %out : f32
- %19 = arith.cmpf ogt, %in, %out : f32
- %20 = arith.select %19, %17, %out_0 : i64
- linalg.yield %18, %20 : f32, i64
- } -> (tensor<16xf32>, tensor<16xi64>)
- flow.dispatch.tensor.store %15#1, %7, offsets = [0], sizes = [16], strides = [1] : tensor<16xi64> -> !flow.dispatch.tensor<writeonly:tensor<16xi64>>
- return
- }
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100", ukernels = "argmax"}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0xFF800000 : f32
+ %c0_i64 = arith.constant 0 : i64
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16xi64>>
+ %8 = flow.dispatch.workload.ordinal %6, 0 : index
+ %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x?xf32>>{%8}
+ %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [16, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x?xf32>>{%8} -> tensor<16x?xf32>
+ %11 = tensor.empty() : tensor<16xi64>
+ %12 = tensor.empty() : tensor<16xf32>
+ %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<16xi64>) -> tensor<16xi64>
+ %14 = linalg.fill ins(%cst : f32) outs(%12 : tensor<16xf32>) -> tensor<16xf32>
+ %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<16x?xf32>) outs(%14, %13 : tensor<16xf32>, tensor<16xi64>) {
+ ^bb0(%in: f32, %out: f32, %out_0: i64):
+ %16 = linalg.index 1 : index
+ %17 = arith.index_cast %16 : index to i64
+ %18 = arith.maximumf %in, %out : f32
+ %19 = arith.cmpf ogt, %in, %out : f32
+ %20 = arith.select %19, %17, %out_0 : i64
+ linalg.yield %18, %20 : f32, i64
+ } -> (tensor<16xf32>, tensor<16xi64>)
+ flow.dispatch.tensor.store %15#1, %7, offsets = [0], sizes = [16], strides = [1] : tensor<16xi64> -> !flow.dispatch.tensor<writeonly:tensor<16xi64>>
+ return
}
}
-}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDefault>
-// CHECK-LABEL: hal.executable.export public @argmax_2d_f32i64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
-//CHECK-LABEL: func.func @argmax_2d_f32i64
-// CHECK-DAG: %[[SUBVIEW:.*]] = memref.subview{{.*}} memref<16x?xf32
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDefault workgroup_size = [32, 1, 1]>
+// CHECK: func.func @argmax_2d_f32i64
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK: %[[SUBVIEW:.*]] = memref.subview{{.*}} memref<16x?xf32
// CHECK-SAME: to memref<1x?xf32
-// CHECK: iree_codegen.ukernel.generic "__iree_uk_rocm_argmax_F32I64" ins(%[[SUBVIEW]]
+// CHECK: iree_codegen.ukernel.generic "__iree_uk_rocm_argmax_F32I64" ins(%[[SUBVIEW]]
// -----
-// When the ukernel attribute is not set, we do not go through ukernel pipeline.
-hal.executable @no_ukernel_argmax_1d_f16i64 {
-hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>) {
- hal.executable.export public @no_ukernel_argmax_1d_f16i64 ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
- hal.return %x, %y, %z : index, index, index
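+// When the ukernel attribute is not set, we do not go through the ukernel pipeline.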
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100"}>
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module {
+ func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0xFC00 : f16
+ %c0_i64 = arith.constant 0 : i64
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+ %8 = flow.dispatch.workload.ordinal %6, 0 : index
+ %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
+ %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
+ %11 = tensor.empty() : tensor<i64>
+ %12 = tensor.empty() : tensor<f16>
+ %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
+ %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
+ %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
+ ^bb0(%in: f16, %out: f16, %out_0: i64):
+ %16 = linalg.index 0 : index
+ %17 = arith.index_cast %16 : index to i64
+ %18 = arith.maximumf %in, %out : f16
+ %19 = arith.cmpf ogt, %in, %out : f16
+ %20 = arith.select %19, %17, %out_0 : i64
+ linalg.yield %18, %20 : f16, i64
+ } -> (tensor<f16>, tensor<i64>)
+ flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+ return
}
- builtin.module {
- func.func @no_ukernel_argmax_1d_f16i64() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0xFC00 : f16
- %c0_i64 = arith.constant 0 : i64
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
- %8 = flow.dispatch.workload.ordinal %6, 0 : index
- %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
- %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
- %11 = tensor.empty() : tensor<i64>
- %12 = tensor.empty() : tensor<f16>
- %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
- %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
- %15:2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
- ^bb0(%in: f16, %out: f16, %out_0: i64):
- %16 = linalg.index 0 : index
- %17 = arith.index_cast %16 : index to i64
- %18 = arith.maximumf %in, %out : f16
- %19 = arith.cmpf ogt, %in, %out : f16
- %20 = arith.select %19, %17, %out_0 : i64
- linalg.yield %18, %20 : f16, i64
- } -> (tensor<f16>, tensor<i64>)
- flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
- return
- }
- }
-}
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute>
-// CHECK-LABEL: hal.executable.export public @no_ukernel_argmax_1d_f16i64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [1 : index, 1 : index, 1 : index]
-//CHECK-LABEL: func.func @no_ukernel_argmax_1d_f16i64
-// CHECK-NOT: iree_codegen.ukernel.generic
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [1, 1, 1]>
+// CHECK: func.func @no_ukernel_argmax_1d_f16i64()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-NOT: iree_codegen.ukernel.generic
// -----
-// Currently we do only handle -Inf case as initial values.
-hal.executable @not_neg_inf_init_argmax_1d {
-hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100", ukernels = "argmax"}>) {
- hal.executable.export public @not_neg_inf_init_argmax_1d ordinal(0) layout(#hal.pipeline.layout<push_constants = 2, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
- hal.return %x, %y, %z : index, index, index
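+// Currently we only handle the -Inf case as the initial value.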
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {target_arch = "gfx1100", ukernels = "argmax"}>
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module {
+ func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f16
+ %c0_i64 = arith.constant 0 : i64
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+ %8 = flow.dispatch.workload.ordinal %6, 0 : index
+ %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
+ %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
+ %11 = tensor.empty() : tensor<i64>
+ %12 = tensor.empty() : tensor<f16>
+ %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
+ %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
+ %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
+ ^bb0(%in: f16, %out: f16, %out_0: i64):
+ %16 = linalg.index 0 : index
+ %17 = arith.index_cast %16 : index to i64
+ %18 = arith.maximumf %in, %out : f16
+ %19 = arith.cmpf ogt, %in, %out : f16
+ %20 = arith.select %19, %17, %out_0 : i64
+ linalg.yield %18, %20 : f16, i64
+ } -> (tensor<f16>, tensor<i64>)
+ flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+ return
}
- builtin.module {
- func.func @not_neg_inf_init_argmax_1d() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.0 : f16
- %c0_i64 = arith.constant 0 : i64
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
- %8 = flow.dispatch.workload.ordinal %6, 0 : index
- %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
- %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
- %11 = tensor.empty() : tensor<i64>
- %12 = tensor.empty() : tensor<f16>
- %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
- %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
- %15:2 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>, affine_map<(d0) -> ()>], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
- ^bb0(%in: f16, %out: f16, %out_0: i64):
- %16 = linalg.index 0 : index
- %17 = arith.index_cast %16 : index to i64
- %18 = arith.maximumf %in, %out : f16
- %19 = arith.cmpf ogt, %in, %out : f16
- %20 = arith.select %19, %17, %out_0 : i64
- linalg.yield %18, %20 : f16, i64
- } -> (tensor<f16>, tensor<i64>)
- flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
- return
- }
- }
-}
}
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute>
-// CHECK-LABEL: hal.executable.export public @not_neg_inf_init_argmax_1d
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUDistribute workgroup_size = [1, 1, 1]>
+// CHECK: func.func @not_neg_inf_init_argmax_1d()
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [1 : index, 1 : index, 1 : index]
-//CHECK-LABEL: func.func @not_neg_inf_init_argmax_1d
-// CHECK-NOT: iree_codegen.ukernel.generic
+// CHECK-NOT: iree_codegen.ukernel.generic
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_distribution_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_distribution_pipeline_test.mlir
index 48e4941..e5e0aa3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_distribution_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_distribution_pipeline_test.mlir
@@ -1,34 +1,26 @@
// RUN: iree-opt --split-input-file --iree-codegen-llvmgpu-use-vector-distribution \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target, canonicalize)))' \
+// RUN: --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target, canonicalize))' \
// RUN: %s | FileCheck %s
-hal.executable @fit_shared_memory_schedule {
-hal.executable.variant public @rocm_hsaco_fb
- target(<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>],
- target_arch = "gfx942", ukernels = "none"}>) {
- hal.executable.export public @fit_shared_memory_schedule ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
+#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {mma_intrinsics = [#iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>], target_arch = "gfx942", ukernels = "none"}>
+module {
+ func.func @fit_shared_memory_schedule() {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c129181184 = arith.constant 129181184 : index
+ %c18112 = arith.constant 18112 : index
+ %c100980224 = arith.constant 100980224 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x80x1280xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<64x80x1280xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 80, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x80x1280xf16>> -> tensor<64x80x1280xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
+ %5 = tensor.empty() : tensor<64x80x1280xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<64x80x1280xf32>) -> tensor<64x80x1280xf32>
+ %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x80x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x80x1280xf32>) -> tensor<64x80x1280xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 80, 1280], strides = [1, 1, 1] : tensor<64x80x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x80x1280xf32>>
+ return
}
- builtin.module {
- func.func @fit_shared_memory_schedule() {
- %cst = arith.constant 0.000000e+00 : f32
- %c129181184 = arith.constant 129181184 : index
- %c18112 = arith.constant 18112 : index
- %c100980224 = arith.constant 100980224 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c129181184) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x80x1280xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c18112) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c100980224) : !flow.dispatch.tensor<writeonly:tensor<64x80x1280xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 80, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x80x1280xf16>> -> tensor<64x80x1280xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
- %5 = tensor.empty() : tensor<64x80x1280xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<64x80x1280xf32>) -> tensor<64x80x1280xf32>
- %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x80x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x80x1280xf32>) -> tensor<64x80x1280xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 80, 1280], strides = [1, 1, 1] : tensor<64x80x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x80x1280xf32>>
- return
- }
- }
-}}
+}
-// CHECK-LABEL: .executable.export public @fit_shared_memory_schedule
+
+// CHECK-LABEL: func.func @fit_shared_memory_schedule()
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
index c93c4aa..0612751 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy, iree-llvmgpu-lower-executable-target)))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
hal.executable private @forward_dispatch_116 {
hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {target_arch = "sm_80"}>) {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel
index 30c56a4..66b44ab 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/BUILD.bazel
@@ -65,6 +65,7 @@
"SPIRVInitialVectorLowering.cpp",
"SPIRVLinkExecutables.cpp",
"SPIRVLowerExecutableTargetPass.cpp",
+ "SPIRVLowerExecutableUsingTransformDialect.cpp",
"SPIRVMapMemRefStorageClass.cpp",
"SPIRVMaterializeExecutableConditions.cpp",
"SPIRVSelectLoweringStrategy.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt
index 7c608f1..b1040c2 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/CMakeLists.txt
@@ -64,6 +64,7 @@
"SPIRVInitialVectorLowering.cpp"
"SPIRVLinkExecutables.cpp"
"SPIRVLowerExecutableTargetPass.cpp"
+ "SPIRVLowerExecutableUsingTransformDialect.cpp"
"SPIRVMapMemRefStorageClass.cpp"
"SPIRVMaterializeExecutableConditions.cpp"
"SPIRVSelectLoweringStrategy.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
index 4d3e61f..524ec19 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
@@ -514,32 +514,32 @@
useIndirectBindings = true;
};
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = exportOps.lookup(funcOp.getName());
+ auto exportOp = getEntryPoint(funcOp);
if (!exportOp)
continue;
- // TODO(ravishankarm): This needs to be removed after ConvertToGPU is
- // deprecated. All passes must set the `workgroup_size` on the
- // `hal.executable.export` directly and not on the function.
if (funcOp->hasAttr(spirv::getEntryPointABIAttrName()))
continue;
- SmallVector<int64_t> workgroupSize = getWorkgroupSize(exportOp);
- if (workgroupSize.empty()) {
- exportOp.emitOpError(
+ std::optional<ArrayAttr> workgroupSize = exportOp->getWorkgroupSize();
+ if (!workgroupSize) {
+ exportOp->emitOpError(
"expected workgroup_size attribute to be set for SPIR-V lowering");
return signalPassFailure();
}
- std::optional<int64_t> subgroupSize = getSubgroupSize(exportOp);
- auto workgroupSize32 = llvm::map_to_vector(
- workgroupSize, [](int64_t v) { return static_cast<int32_t>(v); });
+ auto workgroupSize32 =
+ llvm::map_to_vector(workgroupSize.value(), [](Attribute v) {
+ return static_cast<int32_t>(
+ cast<IntegerAttr>(v).getValue().getZExtValue());
+ });
+
+ std::optional<APInt> subgroupSize = exportOp->getSubgroupSize();
std::optional<int> subgroupSize32;
- if (subgroupSize)
- subgroupSize32 = *subgroupSize;
+ if (subgroupSize && subgroupSize->isNonNegative()) {
+ subgroupSize32 = subgroupSize->getZExtValue();
+ }
for (IREE::HAL::DescriptorSetLayoutAttr setLayout :
- exportOp.getLayout().getSetLayouts()) {
+ exportOp->getLayout().getSetLayouts()) {
bool isIndirect =
setLayout.getFlags() == IREE::HAL::DescriptorSetLayoutFlags::Indirect;
if (isIndirect != useIndirectBindings) {
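The hunk above makes ConvertToSPIRV read the workgroup and subgroup sizes straight from the `hal.executable.export` op through its generated accessors. Below is a condensed sketch of that lookup under the accessor signatures used in the hunk (`getWorkgroupSize()` yielding `std::optional<ArrayAttr>`, `getSubgroupSize()` yielding `std::optional<APInt>`); `readExportSizes` is a hypothetical helper name, not something introduced by this change:
```
// Hypothetical helper condensing the lookup in the hunk above. Assumes the
// hal.executable.export accessors used in this change: getWorkgroupSize()
// returns std::optional<ArrayAttr>, getSubgroupSize() returns
// std::optional<APInt>.
static LogicalResult readExportSizes(IREE::HAL::ExecutableExportOp exportOp,
                                     SmallVectorImpl<int32_t> &workgroupSize32,
                                     std::optional<int> &subgroupSize32) {
  std::optional<ArrayAttr> workgroupSize = exportOp.getWorkgroupSize();
  if (!workgroupSize) {
    return exportOp.emitOpError(
        "expected workgroup_size attribute to be set for SPIR-V lowering");
  }
  for (Attribute dim : workgroupSize.value()) {
    workgroupSize32.push_back(static_cast<int32_t>(
        cast<IntegerAttr>(dim).getValue().getZExtValue()));
  }
  std::optional<APInt> subgroupSize = exportOp.getSubgroupSize();
  if (subgroupSize && subgroupSize->isNonNegative())
    subgroupSize32 = subgroupSize->getZExtValue();
  return success();
}
```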
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index dd55e72..1642662 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -1685,27 +1685,7 @@
//===----------------------------------------------------------------------===//
static LogicalResult setConfigForKernel(const spirv::TargetEnv &targetEnv,
- IREE::HAL::ExecutableExportOp exportOp,
mlir::FunctionOpInterface funcOp) {
- if (!getTranslationInfo(funcOp)) {
- // If no translation info set, first check whether we already have workgroup
- // count set--it's a "contract" to indicate that we should bypass all tiling
- // and distribution to go down just the most basic lowering flow.
- if (Block *body = exportOp.getWorkgroupCountBody()) {
- auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
- // For scalar dispatch cases--using just one thread of one workgroup.
- auto isOne = [](Value value) { return matchPattern(value, m_One()); };
- if (llvm::all_of(retOp.getOperands(), isOne)) {
- std::array<int64_t, 3> workgroupSize = {1, 1, 1};
- if (failed(setDispatchConfig(funcOp, workgroupSize, std::nullopt)))
- return failure();
- auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
- funcOp.getContext(), CodeGenPipeline::SPIRVBaseLowering);
- return setTranslationInfo(funcOp, translationInfo);
- }
- }
- }
-
SmallVector<Operation *> computeOps = getComputeOps(funcOp);
if (computeOps.empty()) {
// No compute operations found. Allow to pass through without a config.
@@ -1742,29 +1722,40 @@
"loop body is expected to be set as root");
}
-LogicalResult initSPIRVLaunchConfig(ModuleOp module) {
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(module);
- spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(module);
+LogicalResult initSPIRVLaunchConfig(FunctionOpInterface funcOp) {
+ spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(funcOp);
if (!targetEnvAttr) {
- return module.emitOpError(
+ return funcOp.emitOpError(
"expected parent hal.executable.variant to have spirv.target_env "
"attribute");
}
- spirv::TargetEnv targetEnv(targetEnvAttr);
+ if (getTranslationInfo(funcOp)) {
+ return success();
+ }
- for (auto funcOp : module.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = exportOps.lookup(funcOp.getName());
- if (!exportOp)
- continue;
- if (getTranslationInfo(exportOp))
- continue;
-
- if (failed(setConfigForKernel(targetEnv, exportOp, funcOp))) {
- return failure();
+ if (auto exportOp = getEntryPoint(funcOp)) {
+ // If no translation info is set, first check whether we already have workgroup
+ // count set--it's a "contract" to indicate that we should bypass all tiling
+ // and distribution to go down just the most basic lowering flow.
+ if (Block *body = exportOp->getWorkgroupCountBody()) {
+ auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator());
+ // For scalar dispatch cases--using just one thread of one workgroup.
+ auto isOne = [](Value value) { return matchPattern(value, m_One()); };
+ if (llvm::all_of(retOp.getOperands(), isOne)) {
+ std::array<int64_t, 3> workgroupSize = {1, 1, 1};
+ auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+ funcOp.getContext(), CodeGenPipeline::SPIRVBaseLowering,
+ workgroupSize);
+ return setTranslationInfo(funcOp, translationInfo);
+ }
}
}
+ spirv::TargetEnv targetEnv(targetEnvAttr);
+ if (failed(setConfigForKernel(targetEnv, funcOp))) {
+ return failure();
+ }
+
return success();
}
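With `initSPIRVLaunchConfig` now operating per function, the launch configuration is recorded on the function itself via `translation_info`. A minimal sketch of that convention, using the same `TranslationInfoAttr::get` overload as the hunk above; `setScalarDispatchConfig` is an illustrative name, not a helper introduced by this change:
```
// Illustrative only: attach a translation_info that carries both the lowering
// pipeline and the workgroup size to the function, rather than writing
// workgroup_size onto the hal.executable.export op.
static LogicalResult setScalarDispatchConfig(mlir::FunctionOpInterface funcOp) {
  std::array<int64_t, 3> workgroupSize = {1, 1, 1};
  auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
      funcOp.getContext(), CodeGenPipeline::SPIRVBaseLowering, workgroupSize);
  return setTranslationInfo(funcOp, translationInfo);
}
```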
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.h b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.h
index c68ba61..e831200 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.h
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.h
@@ -20,6 +20,7 @@
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
namespace mlir::iree_compiler {
@@ -115,7 +116,7 @@
/// Attaches the `translation_info` attribute to entry points in `moduleOp` and
/// `lowering_config` attributes to all root ops in `moduleOp`'s region.
/// These attributes are used to drive the CodeGen pipeline.
-LogicalResult initSPIRVLaunchConfig(ModuleOp moduleOp);
+LogicalResult initSPIRVLaunchConfig(FunctionOpInterface funcOp);
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
index 28894a5..80287e4 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
@@ -21,6 +21,7 @@
#include "iree/compiler/Codegen/SPIRV/Utils.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
+#include "iree/compiler/Utils/PassUtils.h"
#include "llvm/Support/Debug.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h"
@@ -104,83 +105,81 @@
//===----------------------------------------------------------------------===//
static void addTileAndDistributeToWorkgroupsPasses(
- OpPassManager &passManager, bool useFuseTensorPadWithConsumerPass = false,
+ OpPassManager &funcPassManager,
+ bool useFuseTensorPadWithConsumerPass = false,
bool useWARForCooperativeMatrixCodegen = false) {
- passManager.addPass(createTileAndDistributeToWorkgroupsPass(
+ funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
kNumMaxParallelDims,
linalg::DistributionMethod::CyclicNumProcsEqNumIters));
- auto &nestedModulePM = passManager.nest<ModuleOp>();
if (useFuseTensorPadWithConsumerPass) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFuseTensorPadWithConsumerPass());
+ funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
}
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass(
- useWARForCooperativeMatrixCodegen));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createConvertToDestinationPassingStylePass(
+ useWARForCooperativeMatrixCodegen));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
/// Adds passes to lower vector ops to meet SPIR-V requirements.
-static void addSPIRVVectorLoweringPasses(OpPassManager &modulePM) {
- modulePM.addNestedPass<func::FuncOp>(createSPIRVInitialVectorLoweringPass());
- modulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- modulePM.addNestedPass<func::FuncOp>(createSPIRVFinalVectorLoweringPass());
+void addSPIRVVectorLoweringPasses(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(createSPIRVInitialVectorLoweringPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createSPIRVFinalVectorLoweringPass());
}
-static void addBufferizePasses(OpPassManager &passManager,
+static void addBufferizePasses(OpPassManager &funcPassManager,
BufferizationOptions::AllocationFn fn) {
BufferizationOptions::AllocationFn allocationFn = fn;
BufferizationOptions::MemCpyFn memcpyFn = gpuCopyFn;
- addIREEComprehensiveBufferizePasses(passManager, allocationFn, memcpyFn);
+ addIREEComprehensiveBufferizePasses(funcPassManager, allocationFn, memcpyFn);
}
static void
-addSPIRVBufferizePasses(OpPassManager &passManager,
+addSPIRVBufferizePasses(OpPassManager &funcPassManager,
BufferizationOptions::AllocationFn allocationFn) {
// Resolve dim ops first so that we don't have compute Linalg ops lingering on
// because of dim op usage. This avoids bufferizing those compute ops just for
// their shape dimensions.
- passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
- addBufferizePasses(passManager, allocationFn);
+ funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());
+ addBufferizePasses(funcPassManager, allocationFn);
// Distribute immediately after bufferization to avoid losing attribute
// annotations in subsequent transformations. This is a bit fragile right now
// but we expect upstream for loops to eventually recognize distribution as a
// first-class attribute then we don't need this.
- passManager.addNestedPass<func::FuncOp>(createSPIRVDistributePass());
- passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
- passManager.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- passManager.addNestedPass<func::FuncOp>(createCSEPass());
- passManager.addNestedPass<func::FuncOp>(createCleanupBufferAllocViewPass());
+ funcPassManager.addPass(createSPIRVDistributePass());
+ funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createCleanupBufferAllocViewPass());
}
/// Adds passes to materialize structured ops as loops. This replaces structured
/// ops with loop nests containing payloads, so it should be invoked after
/// tiling and vectorization and before buffer transformations.
-static void addLoopMaterializationPasses(OpPassManager &pm) {
- pm.addNestedPass<func::FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());
- pm.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- pm.addNestedPass<func::FuncOp>(createConvertLinalgToLoopsPass());
- pm.addNestedPass<func::FuncOp>(createRemoveSingleIterationLoopPass());
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
+static void addLoopMaterializationPasses(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(IREE::LinalgExt::createLinalgExtToLoopsPass());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createConvertLinalgToLoopsPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
/// Adds passes to lowering MemRefs. This folds MemRef subviews, flattens n-D
/// MemRef into 1-D ones, vectorizes load/store when possible, and performs
/// cross loop nest optimizations. This should be invoked after structured op
/// lowering and before final SPIR-V conversion.
-static void addMemRefLoweringPasses(OpPassManager &pm) {
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
+static void addMemRefLoweringPasses(OpPassManager &modulePassManager) {
+ FunctionLikeNest funcPassManager(modulePassManager);
- pm.addNestedPass<func::FuncOp>(createConvertComplexToStandardPass());
+ funcPassManager.addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ .addPass(createConvertComplexToStandardPass)
- // Math dialect elementry functions -> polynomial form.
- pm.addNestedPass<func::FuncOp>(createPolynomialApproximationPass());
+ // Math dialect elementary functions -> polynomial form.
+ .addPass(createPolynomialApproximationPass)
- pm.addNestedPass<func::FuncOp>(createPadDynamicAlloc());
+ .addPass(createPadDynamicAlloc);
// Check to make sure we are not exceeding shared memory usage limit.
auto getSharedMemoryLimit = [](mlir::FunctionOpInterface func) {
@@ -190,308 +189,280 @@
};
// TODO: query this from the target.
auto getIndexBitwidth = [](mlir::FunctionOpInterface) { return 32; };
- pm.addPass(
- createGPUCheckResourceUsagePass(getSharedMemoryLimit, getIndexBitwidth));
+ funcPassManager
+ .addPass([&]() {
+ return createGPUCheckResourceUsagePass(getSharedMemoryLimit,
+ getIndexBitwidth);
+ })
- // Fold load/store from/to subview ops into the original memref when possible.
- // In SPIR-V we don't use memref descriptor so it's not possible to handle
- // subview ops.
- pm.addPass(memref::createFoldMemRefAliasOpsPass());
- pm.addPass(createEmulateNarrowTypePass());
- pm.addNestedPass<func::FuncOp>(memref::createExpandOpsPass());
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
+ // Fold load/store from/to subview ops into the original memref when
+ // possible. In SPIR-V we don't use memref descriptor so it's not possible
+ // to handle subview ops.
+ .addPass(memref::createFoldMemRefAliasOpsPass)
+ .addPass(createEmulateNarrowTypePass)
+ .addPass(memref::createExpandOpsPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
- // Turn scalar load/store from memrefs into vectorized ones if possible. This
- // gives better memory access patterns, which is very important for perf.
- pm.addPass(createSPIRVVectorizeLoadStore());
- // Perform optimizations that need to across the scf.for region boundary.
- pm.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- // Perform various vector-level cross-op optimizations like load-store
- // forwarding, shape casting and casting op cancelling.
- pm.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
- /*flatten=*/false, /*dropUnitDims=*/false));
- pm.addNestedPass<func::FuncOp>(createSPIRVBreakDownLargeVectorPass());
+ // Turn scalar load/store from memrefs into vectorized ones if possible.
+ // This gives better memory access patterns, which is very important for
+ // perf.
+ .addPass(createSPIRVVectorizeLoadStore)
+ // Perform optimizations that need to go across the scf.for region boundary.
+ .addPass(createForOpCanonicalizationPass)
+ // Perform various vector-level cross-op optimizations like load-store
+ // forwarding, shape casting and casting op cancelling.
+ .addPass([&]() {
+ return createOptimizeVectorTransferPass(
+ /*flatten=*/false, /*dropUnitDims=*/false);
+ })
+ .addPass(createSPIRVBreakDownLargeVectorPass)
- // Perform optimizations that need to across the scf.for region boundary.
- pm.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
- pm.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
- /*flatten=*/false, /*dropUnitDims=*/false));
+ // Perform optimizations that need to go across the scf.for region boundary.
+ .addPass(createForOpCanonicalizationPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ .addPass([&]() {
+ return createOptimizeVectorTransferPass(
+ /*flatten=*/false, /*dropUnitDims=*/false);
+ });
- // Turn multi-dimension memref into one-dimension. This is needed for SPIR-V
- // because we don't use upstream memref descriptors.
- pm.addPass(createFlattenMemRefSubspanPass());
- pm.addNestedPass<func::FuncOp>(
- createSPIRVEraseStorageBufferStaticShapePass());
+ // Turn multi-dimensional memrefs into one-dimensional ones. This is needed
+ // for SPIR-V because we don't use upstream memref descriptors.
+ modulePassManager.addPass(createFlattenMemRefSubspanPass());
+
+ FunctionLikeNest(modulePassManager)
+ .addPass(createSPIRVEraseStorageBufferStaticShapePass);
}
/// Adds passes to perform the final SPIR-V conversion.
-static void addSPIRVLoweringPasses(OpPassManager &pm) {
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
+static void addSPIRVLoweringPasses(OpPassManager &modulePassManager) {
+ FunctionLikeNest(modulePassManager)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ .addPass(createLowerAffinePass)
- pm.addPass(createLowerAffinePass());
+ // Lower ApplyScale before the i64 Emulation Pass so that new 64-bit ops
+ // are also emulated if not supported by the target.
+ .addPass([&]() {
+ return tosa::createTosaToArith(/*includeApplyRescale=*/true,
+ /*use32BitApplyRescale=*/true);
+ })
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ .addPass(createSPIRVMapMemRefStorageClassPass)
+ .addPass(createSPIRVEmulateI64Pass)
+ .addPass(createConvertBf16ArithToF32Pass)
+ .addPass(createConvertBf16ToUInt16BuffersPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass);
- // Lower ApplyScale before the i64 Emulation Pass so that new 64-bit ops are
- // also emulated if not supported by the target.
- pm.addPass(tosa::createTosaToArith(/*includeApplyRescale=*/true,
- /*use32BitApplyRescale=*/true));
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
-
- pm.addNestedPass<func::FuncOp>(createSPIRVMapMemRefStorageClassPass());
- pm.addPass(createSPIRVEmulateI64Pass());
- pm.addPass(createConvertBf16ArithToF32Pass());
- pm.addPass(createConvertBf16ToUInt16BuffersPass());
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
-
- pm.addPass(createConvertToSPIRVPass(clSPIRVIndexingBits));
+ modulePassManager.addPass(createConvertToSPIRVPass(clSPIRVIndexingBits));
auto getTargetEnv = [](spirv::ModuleOp moduleOp) {
return getSPIRVTargetEnvAttr(moduleOp);
};
- OpPassManager &spirvPM = pm.nest<spirv::ModuleOp>();
- spirvPM.addPass(spirv::createUnifyAliasedResourcePass(getTargetEnv));
- spirvPM.addPass(spirv::createSPIRVLowerABIAttributesPass());
- spirvPM.addPass(createCanonicalizerPass());
- spirvPM.addPass(createCSEPass());
- spirvPM.addPass(spirv::createSPIRVRewriteInsertsPass());
- spirvPM.addPass(spirv::createSPIRVCanonicalizeGLPass());
- spirvPM.addPass(spirv::createSPIRVUpdateVCEPass());
-}
-
-void addSPIRVTransformDialectPasses(OpPassManager &passManager,
- StringRef entryPoint) {
- passManager.addPass(
- mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint));
-
- // Dropping the schedule is needed:
- // 1. if we want to embed the transform in the module: we should drop the
- // schedule once applied.
- // 2. if transform.do_not_dce_operands ops are introduced.
- passManager.addPass(createDropSchedulePass());
+ OpPassManager &spirvModulePassManager =
+ modulePassManager.nest<spirv::ModuleOp>();
+ spirvModulePassManager.addPass(
+ spirv::createUnifyAliasedResourcePass(getTargetEnv));
+ spirvModulePassManager.addPass(spirv::createSPIRVLowerABIAttributesPass());
+ spirvModulePassManager.addPass(createCanonicalizerPass());
+ spirvModulePassManager.addPass(createCSEPass());
+ spirvModulePassManager.addPass(spirv::createSPIRVRewriteInsertsPass());
+ spirvModulePassManager.addPass(spirv::createSPIRVCanonicalizeGLPass());
+ spirvModulePassManager.addPass(spirv::createSPIRVUpdateVCEPass());
}
//===----------------------------------------------------------------------===//
// Pass Pipelines
//===----------------------------------------------------------------------===//
-void addSPIRVBaseLoweringPassPipeline(OpPassManager &pm) {
- auto &nestedModulePM = pm.nest<ModuleOp>();
+void addSPIRVBaseLoweringPassPipeline(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(createConvertToDestinationPassingStylePass(
+ /*useWARForCooperativeMatrixCodegen=*/false));
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass(
- /*useWARForCooperativeMatrixCodegen=*/false));
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- addBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
-
- addLoopMaterializationPasses(nestedModulePM);
+ addLoopMaterializationPasses(funcPassManager);
}
-void addSPIRVBaseDistributePassPipeline(OpPassManager &pm) {
- addTileAndDistributeToWorkgroupsPasses(pm);
+void addSPIRVBaseDistributePassPipeline(OpPassManager &funcPassManager) {
+ addTileAndDistributeToWorkgroupsPasses(funcPassManager);
- auto &nestedModulePM = pm.nest<ModuleOp>();
-
- addBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
+ addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);
// Tile and distribute to GPU invocations.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createSPIRVTileAndDistributePass());
- nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUDistributeSharedMemoryCopy());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createSPIRVTileAndDistributePass());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createGPUDistributeSharedMemoryCopy());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- addLoopMaterializationPasses(nestedModulePM);
+ addLoopMaterializationPasses(funcPassManager);
}
-void addSPIRVBaseVectorizePassPipeline(OpPassManager &pm) {
+void addSPIRVBaseVectorizePassPipeline(OpPassManager &funcPassManager) {
addTileAndDistributeToWorkgroupsPasses(
- pm, /*useFuseTensorPadWithConsumerPass=*/true);
+ funcPassManager, /*useFuseTensorPadWithConsumerPass=*/true);
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFoldAffineMinInDistributedLoopsPass());
- nestedModulePM.addPass(memref::createResolveShapedTypeResultDimsPass());
+ funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
+ funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Tile to GPU invocations and vectorize.
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUCreateFastSlowPathPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createSPIRVTilePass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createGPUCreateFastSlowPathPass());
+ funcPassManager.addPass(createSPIRVTilePass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
{
GenericVectorizationPassOptions options;
options.vectorizeGatherAccesses = true;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
+ funcPassManager.addPass(createGenericVectorizationPass(options));
}
- addSPIRVVectorLoweringPasses(nestedModulePM);
- nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ addSPIRVVectorLoweringPasses(funcPassManager);
+ funcPassManager.addPass(createForOpCanonicalizationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Bufferize and distribute.
- addSPIRVBufferizePasses(nestedModulePM, gpuAllocateFunctionMemoryFn);
+ addSPIRVBufferizePasses(funcPassManager, gpuAllocateFunctionMemoryFn);
// Generate loop nests for all remaining ops and remove trivial loops.
- addLoopMaterializationPasses(nestedModulePM);
+ addLoopMaterializationPasses(funcPassManager);
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
- nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
+ funcPassManager.addPass(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
}
-void addSPIRVWinogradVectorizePassPipeline(OpPassManager &pm) {
+void addSPIRVWinogradVectorizePassPipeline(OpPassManager &funcPassManager) {
addTileAndDistributeToWorkgroupsPasses(
- pm, /*useFuseTensorPadWithConsumerPass=*/true);
+ funcPassManager, /*useFuseTensorPadWithConsumerPass=*/true);
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
IREE::LinalgExt::createTileAndDecomposeWinogradTransformPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFoldAffineMinInDistributedLoopsPass());
- nestedModulePM.addPass(memref::createResolveShapedTypeResultDimsPass());
+ funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
+ funcPassManager.addPass(memref::createResolveShapedTypeResultDimsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Tile to GPU invocations and vectorize.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createSPIRVAnnotateWinogradLoopsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createSPIRVAnnotateWinogradLoopsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
{
GenericVectorizationPassOptions options;
options.vectorizeGatherAccesses = true;
options.enableCleanup = true;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
+ funcPassManager.addPass(createGenericVectorizationPass(options));
}
- addSPIRVVectorLoweringPasses(nestedModulePM);
- nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ addSPIRVVectorLoweringPasses(funcPassManager);
+ funcPassManager.addPass(createForOpCanonicalizationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Bufferize and distribute.
- addSPIRVBufferizePasses(nestedModulePM, gpuAllocateFunctionMemoryFn);
+ addSPIRVBufferizePasses(funcPassManager, gpuAllocateFunctionMemoryFn);
// Generate loop nests for all remaining ops and remove trivial loops.
- addLoopMaterializationPasses(nestedModulePM);
+ addLoopMaterializationPasses(funcPassManager);
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
- nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
+ funcPassManager.addPass(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
}
-void addSPIRVCooperativeMatrixVectorizePassPipeline(OpPassManager &pm,
- unsigned pipelineDepth,
- unsigned storeStage) {
+void addSPIRVCooperativeMatrixVectorizePassPipeline(
+ OpPassManager &funcPassManager, unsigned pipelineDepth,
+ unsigned storeStage) {
addTileAndDistributeToWorkgroupsPasses(
- pm, /*useFuseTensorPadWithConsumerPass=*/false,
+ funcPassManager, /*useFuseTensorPadWithConsumerPass=*/false,
/*useWARForCooperativeMatrixCodegen=*/true);
- auto &nestedModulePM = pm.nest<ModuleOp>();
-
- addBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
+ addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);
// Tile to GPU workgroups and promote.
- nestedModulePM.addNestedPass<func::FuncOp>(createSPIRVTileAndPromotePass(
+ funcPassManager.addPass(createSPIRVTileAndPromotePass(
/*promoteCMatrix=*/true, /*skipThreadLevel=*/true));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
// Run canonicalization patterns to propagate constant shape sizes after
// removing trip-one loops.
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Tile and distribute to GPU subgroups.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createSPIRVTileToCooperativeOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createSPIRVTileToCooperativeOpsPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Multi-buffer depending on pipeline depth and distribute to shared memory.
if (pipelineDepth > 0) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUMultiBuffering(pipelineDepth + 1));
+ funcPassManager.addPass(createGPUMultiBuffering(pipelineDepth + 1));
}
- nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUDistributeSharedMemoryCopy());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createGPUDistributeSharedMemoryCopy());
// Reduce bank conflicts by padding.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGPUReduceSharedMemoryBankConflicts(
- detail::bankConflictReductionPaddingBits));
+ funcPassManager.addPass(createGPUReduceSharedMemoryBankConflicts(
+ detail::bankConflictReductionPaddingBits));
// Performs high-level n-D mechanical vectorization. This does not perform
// unrolling or lowering, which is done later.
{
GenericVectorizationPassOptions options;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
+ funcPassManager.addPass(createGenericVectorizationPass(options));
}
// With subview ops, vector hoisting won't kick in. So fold memref subview ops
// before performing vector unrolling and hoisting.
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
// Vectorize to cooperative ops.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createSPIRVVectorizeToCooperativeOpsPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createSPIRVVectorizeToCooperativeOpsPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
// Run canonicalization patterns to propagate constant shape sizes after
// removing trip-one loops.
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
- nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
+ funcPassManager.addPass(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
- nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createSPIRVVectorToGPUSubgroupMMAOpsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- addSPIRVVectorLoweringPasses(nestedModulePM);
+ funcPassManager.addPass(createForOpCanonicalizationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createSPIRVVectorToGPUSubgroupMMAOpsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ addSPIRVVectorLoweringPasses(funcPassManager);
if (pipelineDepth > 0) {
PipeliningSchedulingStrategy schedule =
storeStage == 0 ? PipeliningSchedulingStrategy::loadStoreStage0
: PipeliningSchedulingStrategy::loadGlobalStage0;
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass(
+ funcPassManager.addPass(createGPUPipeliningPass(
/*epiloguePeeling=*/true, pipelineDepth, schedule));
}
}
-void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &topPM,
+void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth,
unsigned storeStage) {
// Guards against 0 for consistency with older user provided tuning configs.
@@ -499,15 +470,13 @@
LLVM_DEBUG(llvm::dbgs() << "Non-zero Pipeline Depth: " << pipelineDepth
<< "\n";);
addTileAndDistributeToWorkgroupsPasses(
- topPM, /*useFuseTensorPadWithConsumerPass=*/false,
+ funcPassManager, /*useFuseTensorPadWithConsumerPass=*/false,
/*useWARForCooperativeMatrixCodegen=*/true);
// Promote to workgroups and tile to threads.
- auto &nestedPM = topPM.nest<ModuleOp>();
- nestedPM.addNestedPass<func::FuncOp>(createGPUTensorTileToSerialLoops());
- nestedPM.addNestedPass<func::FuncOp>(createGPUTensorAlloc());
- nestedPM.addNestedPass<func::FuncOp>(
- createGPUTensorTile(/*distributeToWarp=*/false));
+ funcPassManager.addPass(createGPUTensorTileToSerialLoops());
+ funcPassManager.addPass(createGPUTensorAlloc());
+ funcPassManager.addPass(createGPUTensorTile(/*distributeToWarp=*/false));
// Performs high-level n-D mechanical vectorization. This does not perform
// unrolling or lowering, which is done later.
@@ -517,74 +486,69 @@
options.vectorizeGatherAccesses = true;
options.enableCleanup = false;
options.maxVectorSize = 4096;
- nestedPM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedPM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedPM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedPM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
// Bufferize.
- addBufferizePasses(nestedPM, gpuAllocateWorkgroupMemoryFn);
+ addBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);
// Distribute scf.forall to GPU threads.
- nestedPM.addNestedPass<func::FuncOp>(createGPUDistribute());
+ funcPassManager.addPass(createGPUDistribute());
if (pipelineDepth > 1 || storeStage == 0) {
- nestedPM.addNestedPass<func::FuncOp>(createGPUMultiBuffering(
+ funcPassManager.addPass(createGPUMultiBuffering(
storeStage == 0 ? pipelineDepth + 1 : pipelineDepth));
}
- nestedPM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
- nestedPM.addNestedPass<func::FuncOp>(createGPUDistributeSharedMemoryCopy());
- nestedPM.addPass(createCanonicalizerPass());
- nestedPM.addPass(createCSEPass());
+ funcPassManager.addPass(createMemrefCopyToLinalgPass());
+ funcPassManager.addPass(createGPUDistributeSharedMemoryCopy());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
- nestedPM.addNestedPass<func::FuncOp>(createGPUReduceSharedMemoryBankConflicts(
+ funcPassManager.addPass(createGPUReduceSharedMemoryBankConflicts(
detail::bankConflictReductionPaddingBits));
// With subview ops, vector hoisting won't kick in. So fold memref subview ops
// before performing vector unrolling and hoisting.
- nestedPM.addNestedPass<func::FuncOp>(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
- nestedPM.addNestedPass<func::FuncOp>(createSPIRVInitialVectorLoweringPass());
- nestedPM.addPass(createCSEPass());
- nestedPM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedPM.addNestedPass<func::FuncOp>(createSPIRVFinalVectorLoweringPass());
+ funcPassManager.addPass(createSPIRVInitialVectorLoweringPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createSPIRVFinalVectorLoweringPass());
- nestedPM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- nestedPM.addPass(createCanonicalizerPass());
- nestedPM.addPass(createCSEPass());
- nestedPM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
+ funcPassManager.addPass(createForOpCanonicalizationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
// Hoist loop invariant code to avoid pipelining it.
- nestedPM.addNestedPass<func::FuncOp>(createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
PipeliningSchedulingStrategy schedule =
storeStage == 0 ? PipeliningSchedulingStrategy::loadStoreStage0
: PipeliningSchedulingStrategy::loadGlobalStage0;
- nestedPM.addNestedPass<func::FuncOp>(createGPUPipeliningPass(
+ funcPassManager.addPass(createGPUPipeliningPass(
/*epiloguePeeling=*/true, pipelineDepth, schedule));
- addLoopMaterializationPasses(nestedPM);
+ addLoopMaterializationPasses(funcPassManager);
}
-void addSPIRVSubgroupReducePassPipeline(OpPassManager &pm) {
+void addSPIRVSubgroupReducePassPipeline(OpPassManager &funcPassManager) {
addTileAndDistributeToWorkgroupsPasses(
- pm, /*useFuseTensorPadWithConsumerPass=*/true);
+ funcPassManager, /*useFuseTensorPadWithConsumerPass=*/true);
- auto &nestedModulePM = pm.nest<ModuleOp>();
// Fuse input parallel ops into the reduction op so that we don't need to
// create temporary allocations during bufferization.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRematerializeParallelOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+ funcPassManager.addPass(createRematerializeParallelOpsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUTileReductionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGPUTileReductionPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Performs high-level n-D mechanical vectorization. This does not perform
// unrolling or lowering, which is done later.
@@ -596,109 +560,115 @@
options.vectorizeGatherAccesses = true;
options.enableCleanup = false;
options.generateContract = false;
- nestedModulePM.addNestedPass<func::FuncOp>(
- createGenericVectorizationPass(options));
- nestedModulePM.addNestedPass<func::FuncOp>(
- createOptimizeTensorInsertExtractSlicesPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createGenericVectorizationPass(options));
+ funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
// Bufferize and distribute.
// We bufferize before distributing to threads there; so we are still at the
// block level. Therefore, need to allocate workgroup memory.
- addSPIRVBufferizePasses(nestedModulePM, gpuAllocateWorkgroupMemoryFn);
+ addSPIRVBufferizePasses(funcPassManager, gpuAllocateWorkgroupMemoryFn);
// Perform various vector-level cross-op optimizations like load-store
// forwarding, shape casting and casting op cancelling.
- nestedModulePM.addNestedPass<func::FuncOp>(createOptimizeVectorTransferPass(
+ funcPassManager.addPass(createOptimizeVectorTransferPass(
/*flatten=*/false, /*dropUnitDims=*/false));
// Simplify the IR for vector distribution.
- nestedModulePM.addNestedPass<func::FuncOp>(
- memref::createFoldMemRefAliasOpsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createLoopInvariantCodeMotionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createForOpCanonicalizationPass());
- nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+ funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+ funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createForOpCanonicalizationPass());
+ funcPassManager.addPass(createCanonicalizerPass());
- auto getWarpSize = [](mlir::FunctionOpInterface func) {
- auto moduleOp = func->getParentOfType<ModuleOp>();
- spirv::TargetEnvAttr target = getSPIRVTargetEnvAttr(moduleOp);
- return target.getResourceLimits().getSubgroupSize();
+ auto getWarpSize = [](mlir::FunctionOpInterface func) -> int {
+ // TODO: This kind of callback function is a really bad idea; resolving the
+ // subgroup size should be easier than this.
+ std::optional<int64_t> subgroupSize = getSPIRVSubgroupSize(func);
+ return subgroupSize.value_or(32);
};
// Handle vector reduction operations specifically.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertVectorReductionToGPUPass(/*expandSubgroupReduction=*/false,
- getWarpSize));
+ funcPassManager.addPass(createConvertVectorReductionToGPUPass(
+ /*expandSubgroupReduction=*/false, getWarpSize));
// Perform normal vector unrolling and lowering transformations. This breaks
// vectors down to native machine size.
- addSPIRVVectorLoweringPasses(nestedModulePM);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
-}
-
-void addSPIRVTransformDialectPassPipeline(OpPassManager &pm,
- StringRef entryPoint) {
- addSPIRVTransformDialectPasses(pm, entryPoint);
-
- // Run GenericVectorization pass additionally to convert vectors into forms
- // needed for SPIR-V.
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(createGenericVectorizationPass());
- addSPIRVVectorLoweringPasses(nestedModulePM);
+ addSPIRVVectorLoweringPasses(funcPassManager);
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
}
//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
-void buildSPIRVCodegenConfigurationPassPipeline(OpPassManager &pm) {
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(createGPUGeneralizeNamedOpsPass());
- addCommonTargetExecutablePreprocessingPasses(pm);
- pm.addPass(createSPIRVSelectLoweringStrategyPass());
+static void buildSPIRVCodegenConfigurationPassPipelineImpl(
+ OpPassManager &modulePassManager) {
+ {
+ FunctionLikeNest funcPassManager(modulePassManager);
+ funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
+ addCommonTargetExecutablePreprocessingPasses(funcPassManager);
+ }
+ modulePassManager.addPass(createMaterializeUserConfigsPass());
+
+ modulePassManager.addPass(createSPIRVSelectLoweringStrategyPass());
}
-void buildSPIRVCodegenPassPipeline(OpPassManager &pm) {
- pm.addPass(createSPIRVLowerExecutableTargetPass());
+void buildSPIRVCodegenConfigurationPassPipeline(
+ OpPassManager &variantPassManager) {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ buildSPIRVCodegenConfigurationPassPipelineImpl(modulePassManager);
+}
- addMemRefLoweringPasses(pm.nest<ModuleOp>());
- addSPIRVLoweringPasses(pm.nest<ModuleOp>());
+void buildSPIRVCodegenPassPipeline(OpPassManager &variantPassManager) {
+ {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ modulePassManager.addPass(
+ createSPIRVLowerExecutableUsingTransformDialectPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createSPIRVLowerExecutableTargetPass);
+ addMemRefLoweringPasses(modulePassManager);
+ }
+ variantPassManager.addPass(createReconcileTranslationInfoPass());
+
+ {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ addSPIRVLoweringPasses(modulePassManager);
+ }
LLVM_DEBUG({
llvm::dbgs() << "Using SPIR-V pass pipeline:\n";
- pm.printAsTextualPipeline(llvm::dbgs());
+ variantPassManager.printAsTextualPipeline(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
// NOTE: this runs on the top-level program module containing all hal.executable
// ops.
-void buildSPIRVLinkingPassPipeline(OpPassManager &passManager) {
- auto &nestedExecutablePM = passManager.nest<IREE::HAL::ExecutableOp>();
+void buildSPIRVLinkingPassPipeline(OpPassManager &modulePassManager) {
+ auto &executablePassManager =
+ modulePassManager.nest<IREE::HAL::ExecutableOp>();
// Trim the allowed target environment (version/capability/extension/etc.) to
// the minimal requirement needed by compiled spirv.module ops. This helps to
// increase the chance of linking different variant ops together.
- nestedExecutablePM.addNestedPass<IREE::HAL::ExecutableVariantOp>(
+ executablePassManager.addNestedPass<IREE::HAL::ExecutableVariantOp>(
createSPIRVTrimExecutableTargetEnvPass());
// Materialize the minimal required target environment into proper device
// queries to execute in the runtime.
- nestedExecutablePM.addNestedPass<IREE::HAL::ExecutableVariantOp>(
+ executablePassManager.addNestedPass<IREE::HAL::ExecutableVariantOp>(
createSPIRVMaterializeExecutableConditionsPass());
// Link together executables. This may produce some IR duplication.
- passManager.addPass(createSPIRVLinkExecutablesPass());
+ modulePassManager.addPass(createSPIRVLinkExecutablesPass());
// Cleanup IR duplication.
- passManager.addNestedPass<IREE::HAL::ExecutableOp>(
+ modulePassManager.addNestedPass<IREE::HAL::ExecutableOp>(
mlir::createCanonicalizerPass());
}
@@ -717,16 +687,17 @@
static PassPipelineRegistration<> SPIRVConfigPipeline(
"iree-codegen-spirv-configuration-pipeline",
- "Runs the pipeline for configuring the lowering from linalg to SPIR-V",
- [](OpPassManager &passManager) {
- buildSPIRVCodegenConfigurationPassPipeline(passManager);
+ "Runs the pipeline for configuring the lowering from linalg to SPIR-V on "
+ "all functions in a module",
+ [](OpPassManager &modulePassManager) {
+ buildSPIRVCodegenConfigurationPassPipelineImpl(modulePassManager);
});
static PassPipelineRegistration<> LinalgSPIRVPipeline(
"iree-codegen-linalg-to-spirv-pipeline",
"Runs the progressive lowering pipeline from linalg to SPIR-V",
- [](OpPassManager &passManager) {
- buildSPIRVCodegenPassPipeline(passManager);
+ [](OpPassManager &variantPassManager) {
+ buildSPIRVCodegenPassPipeline(variantPassManager);
});
}
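Most of the rewrites in Passes.cpp above follow one idiom: function-scoped passes are chained through `FunctionLikeNest` (from `iree/compiler/Utils/PassUtils.h`) as pass-constructor callables, while module-scoped passes stay on the module pass manager. A minimal sketch of that idiom; `addExamplePipeline` is a hypothetical name and the individual pass constructors are ones already used in the file:
```
// Sketch of the FunctionLikeNest idiom used throughout Passes.cpp above.
static void addExamplePipeline(OpPassManager &modulePassManager) {
  FunctionLikeNest(modulePassManager)
      .addPass(createCanonicalizerPass)
      .addPass(createCSEPass)
      // A lambda lets a pass constructor take arguments.
      .addPass([&]() {
        return createOptimizeVectorTransferPass(/*flatten=*/false,
                                                /*dropUnitDims=*/false);
      });
  // Passes that must see the whole module are added directly.
  modulePassManager.addPass(createFlattenMemRefSubspanPass());
}
```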
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h
index 28c4aee..9fd67ac 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h
@@ -23,48 +23,47 @@
/// Pass pipeline to lower IREE HAL executables without any tiling and
/// distribution.
-void addSPIRVBaseLoweringPassPipeline(OpPassManager &pm);
+void addSPIRVBaseLoweringPassPipeline(OpPassManager &funcPassManager);
/// Pass pipeline to lower IREE HAL executables by tiling and distributing to
/// workgroups and invocations. Each invocation handles a scalar.
-void addSPIRVBaseDistributePassPipeline(OpPassManager &pm);
+void addSPIRVBaseDistributePassPipeline(OpPassManager &funcPassManager);
-void addSPIRVBaseVectorizePassPipeline(OpPassManager &pm);
+void addSPIRVBaseVectorizePassPipeline(OpPassManager &funcPassManager);
-void addSPIRVCooperativeMatrixVectorizePassPipeline(OpPassManager &pm,
- unsigned pipelineDepth,
- unsigned storeStage);
+/// Adds passes to lower vector ops to meet SPIR-V requirements.
+void addSPIRVVectorLoweringPasses(OpPassManager &funcPassManager);
-void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &pm,
+void addSPIRVCooperativeMatrixVectorizePassPipeline(
+ OpPassManager &funcPassManager, unsigned pipelineDepth,
+ unsigned storeStage);
+
+void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &funcPassManager,
unsigned pipelineDepth,
unsigned storeStage);
/// Pass pipeline to lower IREE HAL executables by tiling and distributing
/// reduction to workgroups and then subgroups.
-void addSPIRVSubgroupReducePassPipeline(OpPassManager &pm);
-
-/// Pass pipeline to lower IREE HAL executables via transform dialect schedules.
-void addSPIRVTransformDialectPassPipeline(OpPassManager &pm,
- StringRef entryPoint);
+void addSPIRVSubgroupReducePassPipeline(OpPassManager &funcPassManager);
/// Pass pipeline to lower winograd ops. This pipeline follows the
/// SPIRVBaseVectorize pipeline with the following exception:
/// Since the ops are already tiled, we skip tiling and instead
/// just annotate the loops with the spirv distribute attribute.
///
-void addSPIRVWinogradVectorizePassPipeline(OpPassManager &pm);
+void addSPIRVWinogradVectorizePassPipeline(OpPassManager &funcPassManager);
/// Populates passes needed to preprocess the input variant before lowering
/// and select lowering strategies.
-void buildSPIRVCodegenConfigurationPassPipeline(OpPassManager &pm);
+void buildSPIRVCodegenConfigurationPassPipeline(
+ OpPassManager &variantPassManager);
/// Populates passes needed to lower linalg/arith/math ops to SPIR-V ops via
-/// the structured ops path. The pass manager `pm` here operate on the module
-/// within the IREE::HAL::ExecutableOp.
-void buildSPIRVCodegenPassPipeline(OpPassManager &pm);
+/// the structured ops path.
+void buildSPIRVCodegenPassPipeline(OpPassManager &variantPassManager);
/// Populates passes needed to link HAL executables across SPIRV targets.
-void buildSPIRVLinkingPassPipeline(OpPassManager &passManager);
+void buildSPIRVLinkingPassPipeline(OpPassManager &modulePassManager);
//===---------------------------------------------------------------------===//
// SPIR-V passes
@@ -78,79 +77,86 @@
std::unique_ptr<OperationPass<ModuleOp>>
createConvertToSPIRVPass(unsigned indexWidth = 32);
-/// Annotates the innermost Winograd loops with the spirv distribute attribute.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+/// Annotates the innermost Winograd loops with the spirv distribute
+/// attribute.
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVAnnotateWinogradLoopsPass();
/// Breaks down large vectors not natively supported by SPIR-V.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVBreakDownLargeVectorPass();
/// Pass to distribute tiled loop nests to invocations.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createSPIRVDistributePass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createSPIRVDistributePass();
/// Emulates bfloat 16 ops with 32-bit float ops.
-std::unique_ptr<OperationPass<ModuleOp>> createSPIRVEmulateBf16Pass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createSPIRVEmulateBf16Pass();
/// Emulates 64-bit integer ops with 32-bit integer ops.
-std::unique_ptr<OperationPass<ModuleOp>> createSPIRVEmulateI64Pass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createSPIRVEmulateI64Pass();
/// Turns static shaped storage buffer subspan ops into dynamic shaped ones.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVEraseStorageBufferStaticShapePass();
/// Pass to perform final vector ops lowering to meet SPIR-V requirements.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVFinalVectorLoweringPass();
/// Creates a pass to fold processor ID uses where possible.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVFoldProcessorIDUsesPass();
/// Pass to perform initial vector ops lowering to meet SPIR-V requirements.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVInitialVectorLoweringPass();
/// Links SPIR-V HAL executables within the top-level program module.
-std::unique_ptr<OperationPass<mlir::ModuleOp>> createSPIRVLinkExecutablesPass();
+std::unique_ptr<OperationPass<ModuleOp>> createSPIRVLinkExecutablesPass();
/// Pass to set the lowering strategy for the target variant.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createSPIRVSelectLoweringStrategyPass();
/// Main pass to lower executables to scalar + vector code on SPIR-V path.
/// Invokes one of the pass pipelines that translate the executable to
/// scalar + vector code.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVLowerExecutableTargetPass();
+/// Pass to lower executables using Transform dialect on the SPIR-V backend.
+/// This shouldn't be a separate pass, but it is since there are some
+/// extra SPIR-V passes that need to be run as well.
+std::unique_ptr<OperationPass<ModuleOp>>
+createSPIRVLowerExecutableUsingTransformDialectPass();
+
/// Pass to map MemRef memory spaces to SPIR-V storage classes.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVMapMemRefStorageClassPass();
-/// Pass to materialize SPIR-V target requirements of hal.exectuable.variant ops
-/// into hal.executable.condition regions.
+/// Pass to materialize SPIR-V target requirements of hal.executable.variant
+/// ops into hal.executable.condition regions.
std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
createSPIRVMaterializeExecutableConditionsPass();
/// Pass to tile and distribute Linalg ops with buffer semantics to
/// invocations.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVTileAndDistributePass();
/// Pass to promote Linalg ops with buffer semantics to use workgroup memory
/// and then tile to invocations.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVTileAndPromotePass(bool promoteCMatrix = false,
bool skipThreadLevel = false);
/// Pass to tile Linalg ops with tensor semantics to invocations.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>> createSPIRVTilePass();
+std::unique_ptr<InterfacePass<FunctionOpInterface>> createSPIRVTilePass();
/// Pass to tile Linalg ops with buffer semantics suitable for lowering to
/// SPIR-V cooperative ops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVTileToCooperativeOpsPass();
// Trims the SPIR-V target environment of a HAL executable variant to the
@@ -159,16 +165,17 @@
createSPIRVTrimExecutableTargetEnvPass();
/// Converts vector ops to gpu subgroup MMA ops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVVectorToGPUSubgroupMMAOpsPass();
/// Converts memref of scalar to memref of vector of efficient size. This will
/// allow converting memory accesses to vector load/store in SPIR-V without
/// pointer bitcasts.
-std::unique_ptr<OperationPass<ModuleOp>> createSPIRVVectorizeLoadStore();
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createSPIRVVectorizeLoadStore();
/// Pass to do vectorization suitable for lowering to SPIR-V cooperative ops.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVVectorizeToCooperativeOpsPass();
/// Pass pipeline to lower IREE HAL executables by tiling and distributing to
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.td b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.td
index 8990965..3ac49b7 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.td
@@ -39,7 +39,7 @@
}
def SPIRVEmulateI64 :
- Pass<"iree-spirv-emulate-i64", "ModuleOp"> {
+ InterfacePass<"iree-spirv-emulate-i64", "mlir::FunctionOpInterface"> {
let summary = "Emulate 64-bit integer ops with 32-bit integer ops";
let constructor = "mlir::iree_compiler::createSPIRVEmulateI64Pass()";
}
@@ -69,14 +69,21 @@
}
def SPIRVLowerExecutableTarget :
- Pass<"iree-spirv-lower-executable-target-pass",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ InterfacePass<"iree-spirv-lower-executable-target-pass",
+ "mlir::FunctionOpInterface"> {
let summary = "Lower the executable target to SPIR-V using one of the "
"IREE::HAL::DispatchLoweringPassPipeline";
let constructor =
"mlir::iree_compiler::createSPIRVLowerExecutableTargetPass()";
}
+def SPIRVLowerExecutableUsingTransformDialect :
+ Pass<"iree-spirv-lower-executable-using-transform-dialect", "ModuleOp"> {
+ let summary = "Lower the executable target to SPIR-V using transform dialect"
+ " followed by some passes to do SPIR-V specific vectorization";
+ let constructor = "mlir::iree_compiler::createSPIRVLowerExecutableUsingTransformDialectPass()";
+}
+
def SPIRVMapMemRefStorageClass :
InterfacePass<"iree-spirv-map-memref-storage-class", "mlir::FunctionOpInterface"> {
let summary = "Map MemRef memory spaces to SPIR-V storage classes";
@@ -93,8 +100,7 @@
}
def SPIRVSelectLoweringStrategy :
- Pass<"iree-spirv-select-lowering-strategy-pass",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ Pass<"iree-spirv-select-lowering-strategy-pass", "ModuleOp"> {
let summary = "Select the IREE::HAL::DispatchLoweringPassPipeline for lowering"
"to SPIR-V";
let constructor =
@@ -145,7 +151,7 @@
}
def SPIRVVectorizeLoadStore :
- Pass<"iree-spirv-vectorize-load-store", "ModuleOp"> {
+ InterfacePass<"iree-spirv-vectorize-load-store", "mlir::FunctionOpInterface"> {
let summary = "Vectorize load/store of memrefs for better memory access";
let constructor = "mlir::iree_compiler::createSPIRVVectorizeLoadStore()";
}
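
To make the attribute movement implied by these `Passes.td` changes concrete, the placement looks roughly like the following; `@kernel` and `#layout` are placeholders and the export-op form is schematic (the exact before/after shapes appear in the test CHECK lines later in this diff):

```
// Before: the export op carried translation_info plus a separate
// workgroup_size attribute.
hal.executable.export public @kernel layout(#layout) attributes {
  translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>,
  workgroup_size = [64 : index, 1 : index, 1 : index]
}

// After: the function carries a single translation_info that also encodes
// the workgroup size.
func.func @kernel() attributes {
  translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [64, 1, 1]>
} {
  return
}
```
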
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEmulateI64.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEmulateI64.cpp
index 9219588..9649e21 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEmulateI64.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEmulateI64.cpp
@@ -152,7 +152,7 @@
patterns.getContext());
}
-static bool supportsI64(ModuleOp op) {
+static bool supportsI64(FunctionOpInterface op) {
spirv::TargetEnvAttr attr = getSPIRVTargetEnvAttr(op);
assert(attr && "Not a valid spirv module");
spirv::TargetEnv env(attr);
@@ -170,7 +170,7 @@
}
void runOnOperation() override {
- ModuleOp op = getOperation();
+ auto op = getOperation();
if (supportsI64(op))
return;
@@ -229,7 +229,8 @@
// Public interface
//===----------------------------------------------------------------------===//
-std::unique_ptr<OperationPass<ModuleOp>> createSPIRVEmulateI64Pass() {
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createSPIRVEmulateI64Pass() {
return std::make_unique<SPIRVEmulateI64Pass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
index bd02aec..ca33d80 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
@@ -4,6 +4,7 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
@@ -15,6 +16,7 @@
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
@@ -57,19 +59,24 @@
} // namespace
void SPIRVLowerExecutableTargetPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
+ auto funcOp = getOperation();
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
if (!translationInfo) {
- variantOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
- return signalPassFailure();
+ return;
}
- OpPassManager pipeline(IREE::HAL::ExecutableVariantOp::getOperationName());
- switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
+ std::optional<OpPassManager> maybePipeline =
+ getFunctionOpInterfacePassManager(funcOp);
+ if (!maybePipeline) {
+ funcOp.emitOpError(
+ "unhandled function-like container during executable lowering");
+ return signalPassFailure();
+ }
+ OpPassManager &pipeline = maybePipeline.value();
+
+ switch (translationInfo.getDispatchLoweringPassPipeline()) {
case CodeGenPipeline::SPIRVBaseLowering:
addSPIRVBaseLoweringPassPipeline(pipeline);
break;
@@ -84,12 +91,12 @@
break;
case CodeGenPipeline::SPIRVCooperativeMatrixVectorize: {
FailureOr<int64_t> maybeDepth =
- getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
- FailureOr<int64_t> maybeStage = getSoftwarePipelineStoreStage(
- translationInfo.value().getConfiguration());
+ getSoftwarePipelineDepth(translationInfo.getConfiguration());
+ FailureOr<int64_t> maybeStage =
+ getSoftwarePipelineStoreStage(translationInfo.getConfiguration());
if (failed(maybeDepth) || failed(maybeStage)) {
- variantOp.emitOpError("invalid cooperative matrix pipeline without "
- "software pipelining configuration.");
+ funcOp.emitOpError("invalid cooperative matrix pipeline without "
+ "software pipelining configuration.");
return signalPassFailure();
}
addSPIRVCooperativeMatrixVectorizePassPipeline(pipeline, *maybeDepth,
@@ -98,12 +105,12 @@
}
case CodeGenPipeline::SPIRVMatmulPromoteVectorize: {
FailureOr<int64_t> maybeDepth =
- getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
- FailureOr<int64_t> maybeStage = getSoftwarePipelineStoreStage(
- translationInfo.value().getConfiguration());
+ getSoftwarePipelineDepth(translationInfo.getConfiguration());
+ FailureOr<int64_t> maybeStage =
+ getSoftwarePipelineStoreStage(translationInfo.getConfiguration());
if (failed(maybeDepth) || failed(maybeStage)) {
- variantOp.emitOpError(
- "invalid matmul pipeline without software pipelining configuration.");
+ funcOp.emitOpError("invalid matmul pipeline without software "
+ "pipelining configuration.");
return signalPassFailure();
}
addSPIRVMatmulPromoteVectorizePassPipeline(pipeline, *maybeDepth,
@@ -113,17 +120,11 @@
case CodeGenPipeline::SPIRVWinogradVectorize:
addSPIRVWinogradVectorizePassPipeline(pipeline);
break;
- case CodeGenPipeline::TransformDialectCodegen: {
- SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec();
- addSPIRVTransformDialectPassPipeline(
- pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef(""));
- break;
- }
// No pipeline specified, nothing to do.
case CodeGenPipeline::None:
return;
default:
- variantOp.emitOpError("unsupported pipeline on GPU target.");
+ funcOp.emitOpError("unsupported pipeline on GPU target.");
return signalPassFailure();
}
@@ -133,12 +134,12 @@
llvm::dbgs() << "\n";
});
- if (failed(runPipeline(pipeline, variantOp))) {
+ if (failed(runPipeline(pipeline, funcOp))) {
return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createSPIRVLowerExecutableTargetPass() {
return std::make_unique<SPIRVLowerExecutableTargetPass>();
}
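
The software-pipelining error paths above expect depth and stage values in the translation_info configuration dictionary. A schematic example of such an attribute on a function is below; the printed form and values are illustrative, and the `pipeline_depth`/`store_stage` key names are assumptions mirroring the `getSoftwarePipelineDepth`/`getSoftwarePipelineStoreStage` helpers rather than something shown in this diff:

```
func.func @promoted_matmul() attributes {
  translation_info = #iree_codegen.translation_info<
      SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1],
      {pipeline_depth = 1, store_stage = 1}>
} {
  return
}
```
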
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableUsingTransformDialect.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableUsingTransformDialect.cpp
new file mode 100644
index 0000000..efad52d
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableUsingTransformDialect.cpp
@@ -0,0 +1,77 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-dialects/Dialect/LinalgTransform/Passes.h"
+#include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/SPIRV/PassDetail.h"
+#include "iree/compiler/Codegen/SPIRV/Passes.h"
+
+namespace mlir::iree_compiler {
+
+namespace {
+class SPIRVLowerExecutableUsingTransformDialectPass
+ : public SPIRVLowerExecutableUsingTransformDialectBase<
+ SPIRVLowerExecutableUsingTransformDialectPass> {
+public:
+ void runOnOperation() override;
+};
+} // namespace
+
+void SPIRVLowerExecutableUsingTransformDialectPass::runOnOperation() {
+ auto moduleOp = getOperation();
+ auto funcOps = moduleOp.getOps<FunctionOpInterface>();
+
+ if (funcOps.empty() || !llvm::hasSingleElement(funcOps)) {
+    // Can only handle dispatches with a single function on the transform
+    // dialect path.
+ return;
+ }
+
+ auto funcOp = *funcOps.begin();
+ IREE::Codegen::TranslationInfoAttr translationInfo =
+ getTranslationInfo(funcOp);
+ if (!translationInfo || translationInfo.getDispatchLoweringPassPipeline() !=
+ IREE::Codegen::DispatchLoweringPassPipeline::
+ TransformDialectCodegen) {
+ return;
+ }
+
+ // Run the interpreter and drop schedule passes.
+ SymbolRefAttr codegenSpec = translationInfo.getCodegenSpec();
+ StringRef entryPoint =
+ codegenSpec ? codegenSpec.getLeafReference() : StringRef("");
+ OpPassManager modulePassManager(ModuleOp::getOperationName());
+ modulePassManager.addPass(
+ iree_compiler::createTransformDialectInterpreterPass(entryPoint));
+ modulePassManager.addPass(createDropSchedulePass());
+
+ OpPassManager &funcPassManager = modulePassManager.nest<func::FuncOp>();
+ funcPassManager.addPass(createGenericVectorizationPass());
+ addSPIRVVectorLoweringPasses(funcPassManager);
+
+ if (failed(runPipeline(modulePassManager, moduleOp))) {
+ moduleOp.emitOpError("failed to run transform dialect passes");
+ return signalPassFailure();
+ }
+
+ // Make sure that the translation info is set to `None` to avoid using
+ // other pass pipelines.
+ auto translationInfoModified = getTranslationInfo(funcOp);
+ if (!translationInfoModified ||
+ translationInfoModified.getDispatchLoweringPassPipeline() !=
+ IREE::Codegen::DispatchLoweringPassPipeline::None) {
+ funcOp->emitOpError("expected transform dialect lowering to set the "
+ "translation_info to use None");
+ return signalPassFailure();
+ }
+}
+
+std::unique_ptr<OperationPass<ModuleOp>>
+createSPIRVLowerExecutableUsingTransformDialectPass() {
+ return std::make_unique<SPIRVLowerExecutableUsingTransformDialectPass>();
+}
+
+} // namespace mlir::iree_compiler
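
As a rough illustration of what this new pass looks for, a dispatch on the transform-dialect path would carry something like the following; `@codegen_spec` is a placeholder symbol and the attribute printing is schematic:

```
func.func @single_dispatch() attributes {
  translation_info = #iree_codegen.translation_info<
      TransformDialectCodegen codegen_spec = @codegen_spec>
} {
  return
}
// After the interpreter runs, the pass checks that the schedule reset the
// pipeline to None so that no other lowering pipeline runs afterwards.
```
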
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp
index 7c42c24..e9d2be0 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVSelectLoweringStrategy.cpp
@@ -57,10 +57,10 @@
/// module.
template <typename F>
static LogicalResult
-verifyLoweringConfiguration(ModuleOp module,
+verifyLoweringConfiguration(FunctionOpInterface funcOp,
IREE::Codegen::TranslationInfoAttr translationInfo,
ArrayRef<int64_t> workgroupSize, F verificationFn) {
- auto walkResult = module.walk([&](Operation *op) -> WalkResult {
+ auto walkResult = funcOp.walk([&](Operation *op) -> WalkResult {
IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
if (!loweringConfig)
return WalkResult::advance();
@@ -70,40 +70,27 @@
}
static LogicalResult
-verifyEntryPoint(ModuleOp moduleOp,
- IREE::Codegen::TranslationInfoAttr translationInfo,
- IREE::HAL::ExecutableExportOp exportOp) {
+verifyTranslationInfo(FunctionOpInterface funcOp,
+ IREE::Codegen::TranslationInfoAttr translationInfo) {
if (translationInfo.getDispatchLoweringPassPipeline() ==
CodeGenPipeline::TransformDialectCodegen) {
// Transform dialect encodes configuration into the schedule directly.
return success();
}
- std::optional<mlir::ArrayAttr> workgroupSizeAttr =
- exportOp.getWorkgroupSize();
- if (!workgroupSizeAttr || workgroupSizeAttr->size() != 3) {
- return moduleOp.emitError(
- "expected workgroup size to have three dimensions for SPIR-V "
- "pipelines");
- }
-
- std::array<int64_t, 3> workgroupSizes;
- for (auto [index, attr] : llvm::enumerate(workgroupSizeAttr.value())) {
- workgroupSizes[index] = llvm::cast<IntegerAttr>(attr).getInt();
- }
-
+ SmallVector<int64_t> workgroupSizes =
+ llvm::to_vector(translationInfo.getWorkgroupSize());
switch (translationInfo.getDispatchLoweringPassPipeline()) {
case CodeGenPipeline::SPIRVBaseVectorize:
- return verifyLoweringConfiguration(moduleOp, translationInfo,
- workgroupSizes,
+ return verifyLoweringConfiguration(funcOp, translationInfo, workgroupSizes,
verifySPIRVBaseVectorizePassPipeline);
case CodeGenPipeline::SPIRVMatmulPromoteVectorize:
return verifyLoweringConfiguration(
- moduleOp, translationInfo, workgroupSizes,
+ funcOp, translationInfo, workgroupSizes,
verifySPIRVMatmulPromoteVectorizePassPipeline);
case CodeGenPipeline::SPIRVCooperativeMatrixVectorize:
return verifyLoweringConfiguration(
- moduleOp, translationInfo, workgroupSizes,
+ funcOp, translationInfo, workgroupSizes,
verifySPIRVCooperativeMatrixVectorizePassPipeline);
default:
break;
@@ -112,31 +99,26 @@
}
void SPIRVSelectLoweringStrategyPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- ModuleOp moduleOp = variantOp.getInnerModule();
+ auto moduleOp = getOperation();
+ for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
+ if (failed(initSPIRVLaunchConfig(funcOp))) {
+ funcOp.emitOpError("failed to set lowering configuration");
+ return signalPassFailure();
+ }
- if (failed(initSPIRVLaunchConfig(moduleOp))) {
- return signalPassFailure();
- }
+ auto translationInfo = getTranslationInfo(funcOp);
+ if (!translationInfo) {
+ continue;
+ }
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- moduleOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
- return signalPassFailure();
- }
-
- // Verify the properties of each entry point based on the target pipeline.
- for (auto exportOp : variantOp.getExportOps()) {
- if (failed(verifyEntryPoint(moduleOp, translationInfo.value(), exportOp))) {
+ // Verify the properties of each entry point based on the target pipeline.
+ if (failed(verifyTranslationInfo(funcOp, translationInfo))) {
return signalPassFailure();
}
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createSPIRVSelectLoweringStrategyPass() {
return std::make_unique<SPIRVSelectLoweringStrategyPass>();
}
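
Because the strategy pass now iterates over every `FunctionOpInterface` op in the module, a single dispatch can in principle end up with a different strategy per function, along these lines (function names and pipeline choices are placeholders):

```
module {
  func.func @use_vectorize() attributes {
    translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [64, 1, 1]>
  } { return }
  func.func @use_distribute() attributes {
    translation_info = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [32, 1, 1]>
  } { return }
}
```
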
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp
index cead85b..1f780ae 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndPromote.cpp
@@ -144,9 +144,6 @@
void SPIRVTileAndPromotePass::runOnOperation() {
MLIRContext *context = &getContext();
auto funcOp = getOperation();
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
- if (failed(exportOp))
- return;
auto threadTileComputeFn = getSPIRVTileSizeComputeFn(funcOp, 1);
if (failed(threadTileComputeFn))
@@ -191,9 +188,15 @@
llvm::dbgs() << "\n\n";
});
- auto workgroupSize = llvm::map_to_vector(
- exportOp->getWorkgroupSize().value(),
- [&](Attribute attr) { return llvm::cast<IntegerAttr>(attr).getInt(); });
+ std::optional<SmallVector<int64_t>> maybeWorkgroupSize =
+ getWorkgroupSize(funcOp);
+ if (!maybeWorkgroupSize) {
+ funcOp.emitOpError(
+ "failed to get workgroup size for tile and promote pass");
+ return signalPassFailure();
+ }
+
+ SmallVector<int64_t> &workgroupSize = maybeWorkgroupSize.value();
int64_t totalThreads = workgroupSize[0] * workgroupSize[1] * workgroupSize[2];
std::optional<int> subgroupSize = getSPIRVSubgroupSize(funcOp);
if (!subgroupSize) {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
index 8fb91b7..0743d97 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
@@ -72,16 +72,19 @@
void setSPIRVCooperativeMatrixInfo(mlir::FunctionOpInterface funcOp,
linalg::LinalgOp rootOp,
ArrayRef<int64_t> shape) {
- auto moduleOp = funcOp->getParentOfType<ModuleOp>();
- auto exportOp = getAllEntryPoints(moduleOp).lookup(funcOp.getName());
+ auto exportOp = getEntryPoint(funcOp);
+ if (!exportOp) {
+ return;
+ }
Builder b(funcOp.getContext());
- exportOp->setAttr(coopMatShapeAttrName, b.getDenseI64ArrayAttr(shape));
+ exportOp.value()->setAttr(coopMatShapeAttrName,
+ b.getDenseI64ArrayAttr(shape));
auto inputType = cast<ShapedType>(rootOp.getDpsInputs().front().getType());
auto outputType = cast<ShapedType>(rootOp.getDpsInits().front().getType());
auto elementTypes = b.getTypeArrayAttr(
{inputType.getElementType(), outputType.getElementType()});
- exportOp->setAttr(coopMatTypeAttrName, elementTypes);
+ exportOp.value()->setAttr(coopMatTypeAttrName, elementTypes);
}
/// Returns the chosen cooperative matrix shape for CodeGen from the
@@ -89,9 +92,9 @@
/// ArrayRef if cannot query.
ArrayRef<int64_t>
getSPIRVCooperativeMatrixShape(mlir::FunctionOpInterface funcOp) {
- auto moduleOp = funcOp->getParentOfType<ModuleOp>();
- auto exportOp = getAllEntryPoints(moduleOp).lookup(funcOp.getName());
- auto attr = exportOp->getAttrOfType<DenseI64ArrayAttr>(coopMatShapeAttrName);
+ auto exportOp = getEntryPoint(funcOp);
+ auto attr =
+ exportOp.value()->getAttrOfType<DenseI64ArrayAttr>(coopMatShapeAttrName);
if (!attr)
return {};
return attr.asArrayRef();
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp
index 301494a..1121c74 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorizeLoadStore.cpp
@@ -1034,22 +1034,20 @@
void SPIRVVectorizeLoadStorePass::runOnOperation() {
// Uses the signature conversion methodology of the dialect conversion
// framework to implement the conversion.
- ModuleOp module = getOperation();
+ auto funcOp = getOperation();
MLIRContext *context = &getContext();
// Prior pass should have unrolled and broken down vectors with rank > 1.
- for (auto func : module.getOps<mlir::FunctionOpInterface>()) {
- auto result = func.walk([](VectorTransferOpInterface transferOp) {
- if (cast<VectorType>(transferOp.getVectorType()).getRank() > 1) {
- transferOp.emitOpError(
- "with rank > 1 should be broken down by prior passes");
- return WalkResult::interrupt();
- }
- return WalkResult::advance();
- });
- if (result.wasInterrupted()) {
- signalPassFailure();
+ auto result = funcOp.walk([](VectorTransferOpInterface transferOp) {
+ if (cast<VectorType>(transferOp.getVectorType()).getRank() > 1) {
+ transferOp.emitOpError(
+ "with rank > 1 should be broken down by prior passes");
+ return WalkResult::interrupt();
}
+ return WalkResult::advance();
+ });
+ if (result.wasInterrupted()) {
+ signalPassFailure();
}
memrefUsageAnalysis = &getAnalysis<MemRefUsageAnalysis>();
@@ -1089,25 +1087,24 @@
[&](auto op) { return !memrefUsageAnalysis->shouldConvertTransfer(op); });
target.markUnknownOpDynamicallyLegal([&](Operation *op) { return true; });
- if (failed(applyPartialConversion(module, target,
+ if (failed(applyPartialConversion(funcOp, target,
std::move(conversionPatterns)))) {
return signalPassFailure();
}
- for (auto func : module.getOps<mlir::FunctionOpInterface>()) {
- RewritePatternSet rewritingPatterns(context);
- rewritingPatterns.add<ScalarizeVectorTransferRead, ScalarizeVectorLoad,
- ScalarizeVectorTransferWrite>(context);
- rewritingPatterns.add<ReifyExtractOfCreateMask>(context);
+ RewritePatternSet rewritingPatterns(context);
+ rewritingPatterns.add<ScalarizeVectorTransferRead, ScalarizeVectorLoad,
+ ScalarizeVectorTransferWrite>(context);
+ rewritingPatterns.add<ReifyExtractOfCreateMask>(context);
- if (failed(
- applyPatternsAndFoldGreedily(func, std::move(rewritingPatterns)))) {
- return signalPassFailure();
- }
+ if (failed(
+ applyPatternsAndFoldGreedily(funcOp, std::move(rewritingPatterns)))) {
+ return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<ModuleOp>> createSPIRVVectorizeLoadStore() {
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
+createSPIRVVectorizeLoadStore() {
return std::make_unique<SPIRVVectorizeLoadStorePass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Utils.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Utils.cpp
index e2d27f4..c504dbf 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Utils.cpp
@@ -36,10 +36,7 @@
const char *getSPIRVDistributeAttrName() { return "iree.spirv.distribute_dim"; }
DictionaryAttr getTargetConfigAttr(Operation *op) {
- auto variant = op->getParentOfType<IREE::HAL::ExecutableVariantOp>();
- if (!variant)
- return nullptr;
- IREE::HAL::ExecutableTargetAttr targetAttr = variant.getTarget();
+ auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
if (!targetAttr)
return nullptr;
return targetAttr.getConfiguration();
@@ -61,14 +58,10 @@
}
std::optional<int> getSPIRVSubgroupSize(mlir::FunctionOpInterface funcOp) {
- auto moduleOp = funcOp->getParentOfType<ModuleOp>();
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
- auto exportOp = exportOps.lookup(funcOp.getName());
- if (!exportOp)
- return std::nullopt;
- if (auto size = exportOp.getSubgroupSize())
- return size->getSExtValue();
+ std::optional<int64_t> subgroupSize = getSubgroupSize(funcOp);
+ if (subgroupSize) {
+ return subgroupSize.value();
+ }
spirv::TargetEnvAttr target = getSPIRVTargetEnvAttr(funcOp);
if (!target)
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp
index 2caf9b0..7c1b7e6 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp
@@ -68,6 +68,11 @@
limits.getMaxComputeWorkgroupSize().getAsValueRange<IntegerAttr>(),
[](const APInt &dim) { return dim.getSExtValue(); });
+ if (workgroupSize.size() < 3) {
+ return funcOp->emitOpError("expected workgroup size to have three "
+ "dimensions for SPIR-V pipelines");
+ }
+
// Verify each dimension of workgroupSize should be power of two.
if (!llvm::isPowerOf2_64(workgroupSize[0]) ||
!llvm::isPowerOf2_64(workgroupSize[1]) ||
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
index 0de5488..d7c32a6 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
@@ -1,54 +1,31 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
// Conv - large OC - distribute to only one workgroup dimension.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @conv_112x112x512 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @conv_112x112x512 layout(#pipeline_layout)
- builtin.module {
- func.func @conv_112x112x512() {
- %c0 = arith.constant 0 : index
- %c512 = arith.constant 512 : index
- %c112 = arith.constant 112 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
- %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
- %22 = tensor.empty() : tensor<1x112x112x512xf32>
- %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
- %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%23 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
- flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1]
- : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @conv_112x112x512() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c512 = arith.constant 512 : index
+ %c112 = arith.constant 112 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
+ %5 = tensor.empty() : tensor<1x112x112x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%6 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1] : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 8, 256], [1, 1, 8, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @conv_112x112x512
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [64, 1, 1]>
// CHECK: func.func @conv_112x112x512()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -56,53 +33,30 @@
// Conv - medium OC/OW/OH - distribute to two workgroup dimensions.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @conv_112x112x32 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @conv_112x112x32 layout(#pipeline_layout)
- builtin.module {
- func.func @conv_112x112x32() {
- %c0 = arith.constant 0 : index
- %c32 = arith.constant 32 : index
- %c112 = arith.constant 112 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
- %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
- %22 = tensor.empty() : tensor<1x112x112x32xf32>
- %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
- %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%23 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
- flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
- : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @conv_112x112x32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %c112 = arith.constant 112 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
+ %5 = tensor.empty() : tensor<1x112x112x32xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 4, 16, 32], [1, 4, 2, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @conv_112x112x32
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 8, 1]>
// CHECK: func.func @conv_112x112x32()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -110,53 +64,28 @@
// Conv - small OC/OW/OH - distribute to all three workgroup dimensions.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @conv_16x16x16 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @conv_16x16x16 layout(#pipeline_layout)
- builtin.module {
- func.func @conv_16x16x16() {
- %c0 = arith.constant 0 : index
- %c16 = arith.constant 16 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
- %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
- %22 = tensor.empty() : tensor<1x16x16x16xf32>
- %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
- %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%13, %15 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%23 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
- flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1]
- : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @conv_16x16x16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
+ %5 = tensor.empty() : tensor<1x16x16x16xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1] : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 8, 16], [1, 2, 2, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @conv_16x16x16
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [4 : index, 4 : index, 4 : index]
-
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [4, 4, 4]>
// CHECK: func.func @conv_16x16x16()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -164,53 +93,30 @@
// Depthwise conv - small OC/OW/OH - distribute to all three workgroup dimensions.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @dwconv_28x28x144 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @dwconv_28x28x144 layout(#pipeline_layout)
- builtin.module {
- func.func @dwconv_28x28x144() {
- %c0 = arith.constant 0 : index
- %c144 = arith.constant 144 : index
- %c28 = arith.constant 28 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
- %14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 144], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
- %16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
- %23 = tensor.empty() : tensor<1x28x28x144xf32>
- %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
- %25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%14, %16 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%24 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
- flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1]
- : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @dwconv_28x28x144() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c144 = arith.constant 144 : index
+ %c28 = arith.constant 28 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
+ %5 = tensor.empty() : tensor<1x28x28x144xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%6 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1] : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 4, 4, 16], [1, 1, 1, 4], [0, 0, 0, 0, 1, 1], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @dwconv_28x28x144
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [4 : index, 4 : index, 4 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [4, 4, 4]>
// CHECK: func.func @dwconv_28x28x144()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -218,51 +124,28 @@
// Depthwise conv - tiny OC/OW/OH - starving the GPU.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @dwconv_4x4x8 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @dwconv_4x4x8 layout(#pipeline_layout)
- builtin.module {
- func.func @dwconv_4x4x8() {
- %c0 = arith.constant 0 : index
- %c8 = arith.constant 8 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
- %14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>> -> tensor<1x9x9x8xf32>
- %16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
- %23 = tensor.empty() : tensor<1x4x4x8xf32>
- %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
- %25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%14, %16 : tensor<1x9x9x8xf32>, tensor<3x3x8xf32>) outs(%24 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
- flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 4, 8], strides = [1, 1, 1, 1]
- : tensor<1x4x4x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @dwconv_4x4x8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %c4 = arith.constant 4 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>> -> tensor<1x9x9x8xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
+ %5 = tensor.empty() : tensor<1x4x4x8xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x9x9x8xf32>, tensor<3x3x8xf32>) outs(%6 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 4, 8], strides = [1, 1, 1, 1] : tensor<1x4x4x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 4, 4, 8], [1, 1, 1, 4], [0, 0, 0, 0, 1, 1], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @dwconv_4x4x8
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 4 : index, 4 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 4, 4]>
// CHECK: func.func @dwconv_4x4x8()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
index e48b5fd..ee8eded 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
@@ -1,162 +1,90 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
// Large matmul that can match the best tiling scheme.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_1024x2048x512 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @matmul_1024x2048x512 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_1024x2048x512() {
- %c0 = arith.constant 0 : index
- %c2048 = arith.constant 2048 : index
- %c1024 = arith.constant 1024 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
- %15 = tensor.empty() : tensor<1024x2048xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%8, %10 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%16 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1]
- : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @matmul_1024x2048x512() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c2048 = arith.constant 2048 : index
+ %c1024 = arith.constant 1024 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
+ %5 = tensor.empty() : tensor<1024x2048xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%6 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 128], [16, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_1024x2048x512
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 2, 1]>
// CHECK: func.func @matmul_1024x2048x512()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Small matmul N that can still tile to all threads in a workgroup.
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_3136x24x96 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @matmul_3136x24x96 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_3136x24x96() {
- %c0 = arith.constant 0 : index
- %c24 = arith.constant 24 : index
- %c3136 = arith.constant 3136 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
- %15 = tensor.empty() : tensor<3136x24xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%8, %10 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%16 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1]
- : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @matmul_3136x24x96() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c24 = arith.constant 24 : index
+ %c3136 = arith.constant 3136 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
+ %5 = tensor.empty() : tensor<3136x24xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%6 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1] : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[448, 8], [14, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_3136x24x96
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 32 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 32, 1]>
// CHECK: func.func @matmul_3136x24x96()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Small matmul M that can still tile to all threads in a workgroup.
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_196x64x192 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @matmul_196x64x192 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_196x64x192() {
- %c0 = arith.constant 0 : index
- %c64 = arith.constant 64 : index
- %c196 = arith.constant 196 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
- %15 = tensor.empty() : tensor<196x64xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<196x64xf32>) -> tensor<196x64xf32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%8, %10 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%16 : tensor<196x64xf32>) -> tensor<196x64xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1]
- : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @matmul_196x64x192() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c64 = arith.constant 64 : index
+ %c196 = arith.constant 196 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
+ %5 = tensor.empty() : tensor<196x64xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x64xf32>) -> tensor<196x64xf32>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%6 : tensor<196x64xf32>) -> tensor<196x64xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1] : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[28, 64], [7, 4], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_196x64x192
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 4 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [16, 4, 1]>
// CHECK: func.func @matmul_196x64x192()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -164,45 +92,26 @@
// Small matmul K that can still tile to all threads in a workgroup.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_12544x96x16 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @matmul_12544x96x16 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_12544x96x16() {
- %c0 = arith.constant 0 : index
- %c96 = arith.constant 96 : index
- %c12544 = arith.constant 12544 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
- linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
- linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @matmul_12544x96x16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c96 = arith.constant 96 : index
+ %c12544 = arith.constant 12544 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
+ linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 32], [16, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_12544x96x16
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 8, 1]>
// CHECK: func.func @matmul_12544x96x16()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -210,51 +119,30 @@
// Odd matmul M and small N that cannot utilize all threads in a workgroup.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_49x160x576 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @matmul_49x160x576 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_49x160x576() {
- %c0 = arith.constant 0 : index
- %c160 = arith.constant 160 : index
- %c49 = arith.constant 49 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<49x576xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<576x160xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x576xf32>> -> tensor<49x576xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x160xf32>> -> tensor<576x160xf32>
- %15 = tensor.empty() : tensor<49x160xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<49x160xf32>) -> tensor<49x160xf32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%8, %10 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%16 : tensor<49x160xf32>) -> tensor<49x160xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1]
- : tensor<49x160xf32> -> !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @matmul_49x160x576() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c160 = arith.constant 160 : index
+ %c49 = arith.constant 49 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<49x576xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<576x160xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x576xf32>> -> tensor<49x576xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x160xf32>> -> tensor<576x160xf32>
+ %5 = tensor.empty() : tensor<49x160xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<49x160xf32>) -> tensor<49x160xf32>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%6 : tensor<49x160xf32>) -> tensor<49x160xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1] : tensor<49x160xf32> -> !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[7, 32], [7, 4], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_49x160x576
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 1, 1]>
// CHECK: func.func @matmul_49x160x576()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -262,53 +150,30 @@
// Large batch matmul.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_4x384x384 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @batch_matmul_4x384x384 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_4x384x384() {
- %c0 = arith.constant 0 : index
- %c384 = arith.constant 384 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>> -> tensor<4x384x32xf32>
- %14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>> -> tensor<4x32x384xf32>
- %21 = tensor.empty() : tensor<4x384x384xf32>
- %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
- %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%11, %14 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%22 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
- flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1]
- : tensor<4x384x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @batch_matmul_4x384x384() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c384 = arith.constant 384 : index
+ %c4 = arith.constant 4 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>> -> tensor<4x384x32xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>> -> tensor<4x32x384xf32>
+ %5 = tensor.empty() : tensor<4x384x384xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
+ %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%6 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1] : tensor<4x384x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 32, 128], [1, 16, 4], [0, 0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @batch_matmul_4x384x384
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 2, 1]>
// CHECK: func.func @batch_matmul_4x384x384()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -316,52 +181,29 @@
// Small batch matmul.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_4x8x8 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @batch_matmul_4x8x8 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_4x8x8() {
- %c0 = arith.constant 0 : index
- %c8 = arith.constant 8 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 8, 32], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>> -> tensor<4x8x32xf32>
- %14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>> -> tensor<4x32x8xf32>
- %21 = tensor.empty() : tensor<4x8x8xf32>
- %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
- %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%11, %14 : tensor<4x8x32xf32>, tensor<4x32x8xf32>) outs(%22 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
- flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 8, 8], strides = [1, 1, 1]
- : tensor<4x8x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Qualcomm:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], subgroup_size = 64>>}>
+module {
+ func.func @batch_matmul_4x8x8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %c4 = arith.constant 4 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 8, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>> -> tensor<4x8x32xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>> -> tensor<4x32x8xf32>
+ %5 = tensor.empty() : tensor<4x8x8xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
+ %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<4x8x32xf32>, tensor<4x32x8xf32>) outs(%6 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 8, 8], strides = [1, 1, 1] : tensor<4x8x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 8, 8], [1, 1, 4], [0, 0, 0, 16]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @batch_matmul_4x8x8
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 8, 1]>
// CHECK: func.func @batch_matmul_4x8x8()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir
index d0a17c7..fa81773 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir
@@ -1,58 +1,34 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable private @nhwc_conv_pointwise_2x64x64x320 {
- hal.executable.variant public @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @nhwc_conv_pointwise_2x64x64x320 layout(#pipeline_layout)
- builtin.module {
- func.func @nhwc_conv_pointwise_2x64x64x320() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 66, 66, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>> -> tensor<2x66x66x320xf16>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 320, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>> -> tensor<3x3x320x320xf16>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>> -> tensor<2x64x64x320xf16>
- %7 = tensor.empty() : tensor<2x64x64x320xf16>
- %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
- %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
- ins(%4, %5 : tensor<2x66x66x320xf16>, tensor<3x3x320x320xf16>) outs(%8 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
- %10 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
- ins(%9, %6 : tensor<2x64x64x320xf16>, tensor<2x64x64x320xf16>) outs(%7 : tensor<2x64x64x320xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %11 = arith.divf %in, %in_0 : f16
- linalg.yield %11 : f16
- } -> tensor<2x64x64x320xf16>
- flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : tensor<2x64x64x320xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @nhwc_conv_pointwise_2x64x64x320() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 66, 66, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>> -> tensor<2x66x66x320xf16>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 320, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>> -> tensor<3x3x320x320xf16>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>> -> tensor<2x64x64x320xf16>
+ %7 = tensor.empty() : tensor<2x64x64x320xf16>
+ %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
+ %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%4, %5 : tensor<2x66x66x320xf16>, tensor<3x3x320x320xf16>) outs(%8 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
+ %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<2x64x64x320xf16>, tensor<2x64x64x320xf16>) outs(%7 : tensor<2x64x64x320xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %11 = arith.divf %in, %in_0 : f16
+ linalg.yield %11 : f16
+ } -> tensor<2x64x64x320xf16>
+ flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : tensor<2x64x64x320xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 64, 64], [1, 1, 8, 8], [0, 0, 0, 0, 1, 1, 8], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @nhwc_conv_pointwise_2x64x64x320
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 8, 1]>
// CHECK: func.func @nhwc_conv_pointwise_2x64x64x320()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: lowering_config = #[[CONFIG]]
-
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
index 985f8a8..e578411 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
@@ -1,262 +1,170 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_f32_16x4096x40x4096 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @batch_matmul_f32_16x4096x40x4096 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_f32_16x4096x40x4096() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
- %6 = linalg.batch_matmul ins(%3, %4 : tensor<16x4096x4096xf32>, tensor<16x4096x40xf32>) outs(%5 : tensor<16x4096x40xf32>) -> tensor<16x4096x40xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : tensor<16x4096x40xf32> -> !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+module {
+ func.func @batch_matmul_f32_16x4096x40x4096() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
+ %6 = linalg.batch_matmul ins(%3, %4 : tensor<16x4096x4096xf32>, tensor<16x4096x40xf32>) outs(%5 : tensor<16x4096x40xf32>) -> tensor<16x4096x40xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : tensor<16x4096x40xf32> -> !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 512, 8, 16]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @batch_matmul_f32_16x4096x40x4096
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 64 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [2, 64, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
// CHECK: func.func @batch_matmul_f32_16x4096x40x4096()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
-// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_f16_64x640x320 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @matmul_f16_64x640x320 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_f16_64x640x320() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x320xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<320x640xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320xf16>> -> tensor<64x320xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [320, 640], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<320x640xf16>> -> tensor<320x640xf16>
- %5 = tensor.empty() : tensor<64x640xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x640xf16>) -> tensor<64x640xf16>
- %7 = linalg.matmul ins(%3, %4 : tensor<64x320xf16>, tensor<320x640xf16>) outs(%6 : tensor<64x640xf16>) -> tensor<64x640xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 640], strides = [1, 1] : tensor<64x640xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
- return
- }
- }
+// -----
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+module {
+ func.func @matmul_f16_64x640x320() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x320xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<320x640xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320xf16>> -> tensor<64x320xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [320, 640], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<320x640xf16>> -> tensor<320x640xf16>
+ %5 = tensor.empty() : tensor<64x640xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x640xf16>) -> tensor<64x640xf16>
+ %7 = linalg.matmul ins(%3, %4 : tensor<64x320xf16>, tensor<320x640xf16>) outs(%6 : tensor<64x640xf16>) -> tensor<64x640xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 640], strides = [1, 1] : tensor<64x640xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128, 32]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 2 : i64, store_stage = 0 : i64}>
-// CHECK: hal.executable.export public @matmul_f16_64x640x320
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 16 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [16, 16, 1], {pipeline_depth = 2 : i64, store_stage = 0 : i64}>
// CHECK: func.func @matmul_f16_64x640x320()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_f32_16x4096x40x4096 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @batch_matmul_f32_16x4096x40x4096 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_f32_16x4096x40x4096() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
- %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>>
- %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
- %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
- %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>> -> tensor<16x4096x48xf32>
- %11 = tensor.empty() : tensor<16x4096x48xf32>
- %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
- %13 = linalg.batch_matmul ins(%9, %10 : tensor<16x4096x4096xf32>, tensor<16x4096x48xf32>) outs(%12 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
- flow.dispatch.tensor.store %13, %8, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : tensor<16x4096x48xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+module {
+ func.func @batch_matmul_f32_16x4096x40x4096() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>> -> tensor<16x4096x48xf32>
+ %5 = tensor.empty() : tensor<16x4096x48xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
+ %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x4096x4096xf32>, tensor<16x4096x48xf32>) outs(%6 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : tensor<16x4096x48xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 256, 16, 32]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @batch_matmul_f32_16x4096x40x4096
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [4 : index, 32 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [4, 32, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
// CHECK: func.func @batch_matmul_f32_16x4096x40x4096()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
-// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_f16_1x4096x4096x512 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @batch_matmul_f16_1x4096x4096x512 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_f16_1x4096x4096x512() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x4096x512xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x512x4096xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x4096x4096xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 4096, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096x512xf16>> -> tensor<1x4096x512xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 512, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x512x4096xf16>> -> tensor<1x512x4096xf16>
- %5 = tensor.empty() : tensor<1x4096x4096xf32>
- %6 = tensor.empty() : tensor<1x4096x4096xf16>
- %7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16>
- %8 = linalg.batch_matmul ins(%3, %4 : tensor<1x4096x512xf16>, tensor<1x512x4096xf16>) outs(%7 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16>
- %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : tensor<1x4096x4096xf16>) outs(%5 : tensor<1x4096x4096xf32>) {
- ^bb0(%in: f16, %out: f32):
- %10 = arith.extf %in : f16 to f32
- linalg.yield %10 : f32
- } -> tensor<1x4096x4096xf32>
- flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0], sizes = [1, 4096, 4096], strides = [1, 1, 1] : tensor<1x4096x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4096x4096xf32>>
- return
- }
- }
+// -----
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @batch_matmul_f16_1x4096x4096x512() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x4096x512xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x512x4096xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x4096x4096xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 4096, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096x512xf16>> -> tensor<1x4096x512xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 512, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x512x4096xf16>> -> tensor<1x512x4096xf16>
+ %5 = tensor.empty() : tensor<1x4096x4096xf32>
+ %6 = tensor.empty() : tensor<1x4096x4096xf16>
+ %7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16>
+ %8 = linalg.batch_matmul ins(%3, %4 : tensor<1x4096x512xf16>, tensor<1x512x4096xf16>) outs(%7 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16>
+ %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : tensor<1x4096x4096xf16>) outs(%5 : tensor<1x4096x4096xf32>) {
+ ^bb0(%in: f16, %out: f32):
+ %10 = arith.extf %in : f16 to f32
+ linalg.yield %10 : f32
+ } -> tensor<1x4096x4096xf32>
+ flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0], sizes = [1, 4096, 4096], strides = [1, 1, 1] : tensor<1x4096x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4096x4096xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 256, 32]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @batch_matmul_f16_1x4096x4096x512
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
// CHECK: func.func @batch_matmul_f16_1x4096x4096x512()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
-
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_multi_reduce_i4xf32xf32 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @matmul_multi_reduce_i4xf32xf32 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_multi_reduce_i4xf32xf32() {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = arith.index_castui %0 : i32 to index
- %6 = arith.index_castui %1 : i32 to index
- %7 = arith.index_castui %2 : i32 to index
- %8 = arith.index_castui %3 : i32 to index
- %9 = arith.index_castui %4 : i32 to index
- %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>>
- %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>>
- %12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>>
- %13 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x32x128xf32>>
- %14 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<512x11008xf32>>
- %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>> -> tensor<11008x32x128xi4>
- %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>> -> tensor<11008x32xf32>
- %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>> -> tensor<11008x32xf32>
- %18 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [512, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x32x128xf32>> -> tensor<512x32x128xf32>
- %19 = tensor.empty() : tensor<512x11008xf32>
- %20 = tensor.empty() : tensor<11008x32x128xf32>
- %21 = linalg.fill ins(%cst : f32) outs(%19 : tensor<512x11008xf32>) -> tensor<512x11008xf32>
- %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %16, %17 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%20 : tensor<11008x32x128xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %24 = arith.extui %in : i4 to i32
- %25 = arith.uitofp %24 : i32 to f32
- %26 = arith.subf %25, %in_1 : f32
- %27 = arith.mulf %26, %in_0 : f32
- linalg.yield %27 : f32
- } -> tensor<11008x32x128xf32>
- %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%18, %22 : tensor<512x32x128xf32>, tensor<11008x32x128xf32>) outs(%21 : tensor<512x11008xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %24 = arith.mulf %in, %in_0 : f32
- %25 = arith.addf %24, %out : f32
- linalg.yield %25 : f32
- } -> tensor<512x11008xf32>
- flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [512, 11008], strides = [1, 1] : tensor<512x11008xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x11008xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+module {
+ func.func @matmul_multi_reduce_i4xf32xf32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = arith.index_castui %0 : i32 to index
+ %6 = arith.index_castui %1 : i32 to index
+ %7 = arith.index_castui %2 : i32 to index
+ %8 = arith.index_castui %3 : i32 to index
+ %9 = arith.index_castui %4 : i32 to index
+ %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>>
+ %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>>
+ %12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>>
+ %13 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512x32x128xf32>>
+ %14 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<512x11008xf32>>
+ %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>> -> tensor<11008x32x128xi4>
+ %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>> -> tensor<11008x32xf32>
+ %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf32>> -> tensor<11008x32xf32>
+ %18 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [512, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<512x32x128xf32>> -> tensor<512x32x128xf32>
+ %19 = tensor.empty() : tensor<512x11008xf32>
+ %20 = tensor.empty() : tensor<11008x32x128xf32>
+ %21 = linalg.fill ins(%cst : f32) outs(%19 : tensor<512x11008xf32>) -> tensor<512x11008xf32>
+ %22 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %16, %17 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%20 : tensor<11008x32x128xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %24 = arith.extui %in : i4 to i32
+ %25 = arith.uitofp %24 : i32 to f32
+ %26 = arith.subf %25, %in_1 : f32
+ %27 = arith.mulf %26, %in_0 : f32
+ linalg.yield %27 : f32
+ } -> tensor<11008x32x128xf32>
+ %23 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%18, %22 : tensor<512x32x128xf32>, tensor<11008x32x128xf32>) outs(%21 : tensor<512x11008xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %24 = arith.mulf %in, %in_0 : f32
+ %25 = arith.addf %24, %out : f32
+ linalg.yield %25 : f32
+ } -> tensor<512x11008xf32>
+ flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [512, 11008], strides = [1, 1] : tensor<512x11008xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x11008xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128, 1, 16]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @matmul_multi_reduce_i4xf32xf32
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
// CHECK: func.func @matmul_multi_reduce_i4xf32xf32()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir
index ce4512d..f7c04fd 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir
@@ -1,218 +1,114 @@
-// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' \
-// RUN: %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
#map = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_256x1024x128_div_add {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_256x1024x128_div_add layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_256x1024x128_div_add() {
- %c0 = arith.constant 0 : index
- %c1024 = arith.constant 1024 : index
- %c256 = arith.constant 256 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x128xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
- %14 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
- %17 = tensor.empty() : tensor<256x1024xf16>
- %19 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x128xf16>> -> tensor<256x128xf16>
- %21 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>> -> tensor<128x1024xf16>
- %24 = tensor.empty() : tensor<256x1024xf16>
- %25 = linalg.fill ins(%cst : f16) outs(%24 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %26 = linalg.matmul ins(%19, %21 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%25 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %27 = linalg.generic {
- indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]}
- ins(%26, %11, %14 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>)
- outs(%17 : tensor<256x1024xf16>) {
- ^bb0(%arg2: f16, %arg3: f16, %arg4: f16, %arg5: f16): // no predecessors
- %28 = arith.divf %arg2, %arg3 : f16
- %29 = arith.addf %28, %arg4 : f16
- linalg.yield %29 : f16
- } -> tensor<256x1024xf16>
- flow.dispatch.tensor.store %27, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- return
- }
- }
+module {
+ func.func @matmul_256x1024x128_div_add() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c1024 = arith.constant 1024 : index
+ %c256 = arith.constant 256 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x128xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
+ %7 = tensor.empty() : tensor<256x1024xf16>
+ %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xf16>> -> tensor<256x128xf16>
+ %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>> -> tensor<128x1024xf16>
+ %10 = tensor.empty() : tensor<256x1024xf16>
+ %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16):
+ %14 = arith.divf %in, %in_0 : f16
+ %15 = arith.addf %14, %in_1 : f16
+ linalg.yield %15 : f16
+ } -> tensor<256x1024xf16>
+ flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128], [32, 64], [0, 0, 32], [16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @matmul_256x1024x128_div_add
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
// CHECK: func.func @matmul_256x1024x128_div_add()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-
-#map = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable public @batch_matmul_16x128x256x512_div {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @batch_matmul_16x128x256x512_div layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_16x128x256x512_div() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>> -> tensor<16x128x512xf16>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>> -> tensor<16x512x256xf16>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>> -> tensor<16x128x256xf16>
- %7 = tensor.empty() : tensor<16x128x256xf16>
- %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
- %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
- %10 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : tensor<16x128x256xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %11 = arith.divf %in, %in_0 : f16
- linalg.yield %11 : f16
- } -> tensor<16x128x256xf16>
- flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @batch_matmul_16x128x256x512_div() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>> -> tensor<16x128x512xf16>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>> -> tensor<16x512x256xf16>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>> -> tensor<16x128x256xf16>
+ %7 = tensor.empty() : tensor<16x128x256xf16>
+ %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
+ %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
+ %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : tensor<16x128x256xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %11 = arith.divf %in, %in_0 : f16
+ linalg.yield %11 : f16
+ } -> tensor<16x128x256xf16>
+ flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @batch_matmul_16x128x256x512_div
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
// CHECK: func.func @batch_matmul_16x128x256x512_div()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Linalg.generic that is a batch matmul.
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @generic_batch_matmul_32x8x512x64 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export @generic_batch_matmul_32x8x512x64 layout(#pipeline_layout)
- builtin.module {
- func.func @generic_batch_matmul_32x8x512x64() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>> -> tensor<128x32x64xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>> -> tensor<32x64x512xf16>
- %5 = tensor.empty() : tensor<32x128x512xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
- ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>)
- attrs = {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]} {
- ^bb0(%arg0: f16, %arg1: f16, %arg2: f16):
- %8 = arith.mulf %arg0, %arg1 : f16
- %9 = arith.addf %arg2, %8 : f16
- linalg.yield %9 : f16
- } -> tensor<32x128x512xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : tensor<32x128x512xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @generic_batch_matmul_32x8x512x64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>> -> tensor<128x32x64xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>> -> tensor<32x64x512xf16>
+ %5 = tensor.empty() : tensor<32x128x512xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.mulf %in, %in_0 : f16
+ %9 = arith.addf %out, %8 : f16
+ linalg.yield %9 : f16
+ } -> tensor<32x128x512xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : tensor<32x128x512xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @generic_batch_matmul_32x8x512x64
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
// CHECK: func.func @generic_batch_matmul_32x8x512x64()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[$CONFIG]]
@@ -220,117 +116,54 @@
// K dim size not divisible by 32.
-#map = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable public @batch_matmul_16x1024x1024x80 {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @batch_matmul_16x1024x1024x80 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_16x1024x1024x80() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>> -> tensor<16x1024x80xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>> -> tensor<16x80x1024xf16>
- %5 = tensor.empty() : tensor<16x1024x1024xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
- %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+module {
+ func.func @batch_matmul_16x1024x1024x80() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>> -> tensor<16x1024x80xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>> -> tensor<16x80x1024xf16>
+ %5 = tensor.empty() : tensor<16x1024x1024xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
+ %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 128], [1, 32, 64], [0, 0, 0, 16], [1, 16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @batch_matmul_16x1024x1024x80
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 0 : i64}>
// CHECK: func.func @batch_matmul_16x1024x1024x80()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Small K - not supported by cooperative matrix.
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_256x1024x8 {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_256x1024x8 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_256x1024x8() {
- %c0 = arith.constant 0 : index
- %c1024 = arith.constant 1024 : index
- %c256 = arith.constant 256 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x8xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x8xf16>> -> tensor<256x8xf16>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>> -> tensor<8x1024xf16>
- %15 = tensor.empty() : tensor<256x1024xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%8, %10 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%16 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+module {
+ func.func @matmul_256x1024x8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c1024 = arith.constant 1024 : index
+ %c256 = arith.constant 256 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x8xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x8xf16>> -> tensor<256x8xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>> -> tensor<8x1024xf16>
+ %5 = tensor.empty() : tensor<256x1024xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%6 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ return
}
}
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
-// CHECK-LABEL: hal.executable.export public @matmul_256x1024x8
-// CHECK-NOT: subgroup_size =
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-NOT: subgroup_size =
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
+// CHECK-LABEL: func.func @matmul_256x1024x8
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
index d77a064..7a14111 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
@@ -1,117 +1,62 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
// Convolution with consumer pointwise ops
-
-#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
-#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 112)>
-#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 32)>
-#map3 = affine_map<(d0) -> (d0 * 2)>
-#map4 = affine_map<(d0, d1) -> (d0 * 2 + 1, d1 * -2 + 225)>
-#map5 = affine_map<(d0)[s0] -> (-d0 + 32, s0)>
-#map6 = affine_map<(d0)[s0] -> (-d0 + 112, s0)>
-#map7 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @nhwc_conv_pointwise_112x112x32 {
- hal.executable.variant public @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @nhwc_conv_pointwise_112x112x32 layout(#pipeline_layout)
- builtin.module {
- func.func @nhwc_conv_pointwise_112x112x32() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %c112 = arith.constant 112 : index
- %c32 = arith.constant 32 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
- %14 = tensor.empty() : tensor<1x112x112x32xf32>
- %19 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
- %21 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
- %24 = tensor.empty() : tensor<1x112x112x32xf32>
- %25 = linalg.fill ins(%cst : f32) outs(%24 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
- %26 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%19, %21 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%25 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
- %27 = linalg.generic {indexing_maps = [#map7, #map7, #map7], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
- ins(%26, %13 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) outs(%14 : tensor<1x112x112x32xf32>) {
- ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors
- %28 = arith.subf %arg3, %arg4 : f32
- linalg.yield %28 : f32
- } -> tensor<1x112x112x32xf32>
- flow.dispatch.tensor.store %27, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
- : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @nhwc_conv_pointwise_112x112x32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c112 = arith.constant 112 : index
+ %c32 = arith.constant 32 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
+ %5 = tensor.empty() : tensor<1x112x112x32xf32>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
+ %8 = tensor.empty() : tensor<1x112x112x32xf32>
+ %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ %10 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%6, %7 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%9 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %4 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) outs(%5 : tensor<1x112x112x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %12 = arith.subf %in, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<1x112x112x32xf32>
+ flow.dispatch.tensor.store %11, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 4, 4, 32], [1, 2, 2, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @nhwc_conv_pointwise_112x112x32
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 2 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 2]>
// CHECK: func.func @nhwc_conv_pointwise_112x112x32()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable private @nchw_conv_2x1280x8x8 {
- hal.executable.variant public @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @nchw_conv_2x1280x8x8 layout(#pipeline_layout)
- builtin.module {
- func.func @nchw_conv_2x1280x8x8() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 1280, 10, 10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>> -> tensor<2x1280x10x10xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1280, 1280, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>> -> tensor<1280x1280x3x3xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>> -> tensor<2x1280x8x8xf32>
- %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 : tensor<2x1280x10x10xf32>, tensor<1280x1280x3x3xf32>) outs(%5 : tensor<2x1280x8x8xf32>) -> tensor<2x1280x8x8xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : tensor<2x1280x8x8xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+module {
+ func.func @nchw_conv_2x1280x8x8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 1280, 10, 10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>> -> tensor<2x1280x10x10xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1280, 1280, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>> -> tensor<1280x1280x3x3xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>> -> tensor<2x1280x8x8xf32>
+ %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 : tensor<2x1280x10x10xf32>, tensor<1280x1280x3x3xf32>) outs(%5 : tensor<2x1280x8x8xf32>) -> tensor<2x1280x8x8xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : tensor<2x1280x8x8xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 16, 8, 8], [1, 8, 1, 4], [0, 0, 0, 0, 4, 1, 1], [0, 0, 1, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @nchw_conv_2x1280x8x8
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 8 : index, 2 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 8, 2]>
// CHECK: func.func @nchw_conv_2x1280x8x8()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nchw_fchw
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
index 5096234..30fd664 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
@@ -1,186 +1,109 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_1d_sort {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @static_1d_sort layout(#pipeline_layout)
- builtin.module {
- func.func @static_1d_sort() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
- %1 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1000], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<1000xi32>> -> tensor<1000xi32>
- %2 = iree_linalg_ext.sort {__internal_linalg_transform__ = "workgroup"} dimension(0) outs(%1 : tensor<1000xi32>) {
- ^bb0(%arg0: i32, %arg1: i32): // no predecessors
- %3 = arith.cmpi slt, %arg0, %arg1 : i32
- iree_linalg_ext.yield %3 : i1
- } -> tensor<1000xi32>
- flow.dispatch.tensor.store %2, %0, offsets = [0], sizes = [1000], strides = [1] : tensor<1000xi32> -> !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @static_1d_sort() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
+ %1 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1000], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<1000xi32>> -> tensor<1000xi32>
+ %2 = iree_linalg_ext.sort {__internal_linalg_transform__ = "workgroup"} dimension(0) outs(%1 : tensor<1000xi32>) {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %3 = arith.cmpi slt, %arg0, %arg1 : i32
+ iree_linalg_ext.yield %3 : i1
+ } -> tensor<1000xi32>
+ flow.dispatch.tensor.store %2, %0, offsets = [0], sizes = [1000], strides = [1] : tensor<1000xi32> -> !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
+ return
}
}
// Check that the workgroup count and size are (1, 1, 1) for serializing the computation.
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = []>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @static_1d_sort
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [1 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [1, 1, 1]>
// CHECK: func.func @static_1d_sort()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.sort
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_3d_sort {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @static_3d_sort layout(#pipeline_layout)
- builtin.module {
- func.func @static_3d_sort() {
- %c64 = arith.constant 64 : index
- %c128 = arith.constant 128 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x32x128xi32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x32x128xi32>
- linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%0 : memref<64x32x128xi32>) outs(%1 : memref<64x32x128xi32>) {
- ^bb0(%arg4: i32, %s: i32): // no predecessors
- linalg.yield %arg4 : i32
- }
- iree_linalg_ext.sort {__internal_linalg_transform__ = "workgroup"} dimension(1) outs(%1 : memref<64x32x128xi32>) {
- ^bb0(%arg2: i32, %arg3: i32): // no predecessors
- %11 = arith.cmpi slt, %arg2, %arg3 : i32
- iree_linalg_ext.yield %11 : i1
- }
- return
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @static_3d_sort() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c64 = arith.constant 64 : index
+ %c128 = arith.constant 128 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x32x128xi32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x32x128xi32>
+ linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : memref<64x32x128xi32>) outs(%1 : memref<64x32x128xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ linalg.yield %in : i32
}
+ iree_linalg_ext.sort {__internal_linalg_transform__ = "workgroup"} dimension(1) outs(%1 : memref<64x32x128xi32>) {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %2 = arith.cmpi slt, %arg0, %arg1 : i32
+ iree_linalg_ext.yield %2 : i1
+ }
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 0, 16], [1, 0, 1]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @static_3d_sort
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [16, 1, 1]>
// CHECK: func.func @static_3d_sort()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.sort
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_1d_fft_stage2 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirvfb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @static_1d_fft_stage2 layout(#pipeline_layout)
- builtin.module {
- func.func @static_1d_fft_stage2() {
- %c0 = arith.constant 0 : index
- %c2 = arith.constant 2 : index
- %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
- %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %4:2 = iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
- flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- return
- }
- }
+#executable_target_vulkan_spirvfb = #hal.executable.target<"vulkan-spirv", "vulkan-spirvfb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @static_1d_fft_stage2() attributes {hal.executable.target = #executable_target_vulkan_spirvfb} {
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
+ %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %4:2 = iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
+ flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @static_1d_fft_stage2
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [16, 1, 1]>
// CHECK: func.func @static_1d_fft_stage2()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.fft
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_3d_fft_stage3 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirvfb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @static_3d_fft_stage3 layout(#pipeline_layout)
- builtin.module {
- func.func @static_3d_fft_stage3() {
- %c0 = arith.constant 0 : index
- %c3 = arith.constant 3 : index
- %c64 = arith.constant 64 : index
- %c128 = arith.constant 128 : index
- %c32 = arith.constant 32 : index
- %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
- %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
- %0 = bufferization.to_memref %cst_0 : memref<4xf32>
- %1 = bufferization.to_memref %cst : memref<4xf32>
- %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
- %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
- iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"}
- ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>)
- outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
- return
- }
- }
+#executable_target_vulkan_spirvfb = #hal.executable.target<"vulkan-spirv", "vulkan-spirvfb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @static_3d_fft_stage3() attributes {hal.executable.target = #executable_target_vulkan_spirvfb} {
+ %c0 = arith.constant 0 : index
+ %c3 = arith.constant 3 : index
+ %c64 = arith.constant 64 : index
+ %c128 = arith.constant 128 : index
+ %c32 = arith.constant 32 : index
+ %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
+ %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
+ %0 = bufferization.to_memref %cst_0 : memref<4xf32>
+ %1 = bufferization.to_memref %cst : memref<4xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
+ iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @static_3d_fft_stage3
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [16, 1, 1]>
// CHECK: func.func @static_3d_fft_stage3()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.fft
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
index 173732d..5cc86b6 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
@@ -1,245 +1,145 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @copy_as_generic {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @copy_as_generic layout(#pipeline_layout)
- builtin.module {
- func.func @copy_as_generic() {
- %c0 = arith.constant 0 : index
- %d0 = hal.interface.constant.load[0] : index
- %d1 = hal.interface.constant.load[1] : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%d0, %d1}
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%d0, %d1}
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%0 : memref<?x?xi32>) outs(%1 : memref<?x?xi32>) {
- ^bb0(%arg4: i32, %s: i32): // no predecessors
- linalg.yield %arg4 : i32
- }
- return
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @copy_as_generic() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%0, %1}
+ linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<?x?xi32>) outs(%3 : memref<?x?xi32>) {
+ ^bb0(%in: i32, %out: i32):
+ linalg.yield %in : i32
}
+ return
}
}
-
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 16], [1, 1]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @copy_as_generic
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [16, 1, 1]>
+// CHECK: func.func @copy_as_generic()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @tensor_insert {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @copy layout(#pipeline_layout)
- builtin.module {
- func.func @copy() {
- %c0 = arith.constant 0 : index
- %c224 = arith.constant 224 : index
- %c3 = arith.constant 3 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x224x224x3xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1x224x224x3xf32>
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
- ins(%0 : memref<1x224x224x3xf32>) outs(%1 : memref<1x224x224x3xf32>) {
- ^bb0(%arg4: f32, %s: f32): // no predecessors
- linalg.yield %arg4 : f32
- }
- return
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @copy() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c224 = arith.constant 224 : index
+ %c3 = arith.constant 3 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x224x224x3xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1x224x224x3xf32>
+ linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : memref<1x224x224x3xf32>) outs(%1 : memref<1x224x224x3xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
}
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 2, 32, 1], [0, 1, 1, 1]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @copy
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [1, 32, 2]>
+// CHECK: func.func @copy()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Average pooling op with nice tilable input.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @avg_pool {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @avg_pool layout(#pipeline_layout)
- builtin.module {
- func.func @avg_pool() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %c2 = arith.constant 2 : index
- %c8 = arith.constant 8 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
- %2 = tensor.empty() : tensor<12x12xf32>
- %14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 24, 24, 8], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>> -> tensor<1x24x24x8xf32>
- %20 = tensor.empty() : tensor<1x2x2x8xf32>
- %21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
- %22 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>}
- ins(%14, %2 : tensor<1x24x24x8xf32>, tensor<12x12xf32>)
- outs(%21 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
- flow.dispatch.tensor.store %22, %1, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 8], strides = [1, 1, 1, 1]
- : tensor<1x2x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+module {
+ func.func @avg_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c2 = arith.constant 2 : index
+ %c8 = arith.constant 8 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
+ %2 = tensor.empty() : tensor<12x12xf32>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 24, 24, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>> -> tensor<1x24x24x8xf32>
+ %4 = tensor.empty() : tensor<1x2x2x8xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
+ %6 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>} ins(%3, %2 : tensor<1x24x24x8xf32>, tensor<12x12xf32>) outs(%5 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
+ flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 8], strides = [1, 1, 1, 1] : tensor<1x2x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 2, 2, 8], [1, 1, 1, 4], [0, 0, 0, 0, 1, 1], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @avg_pool
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 2, 2]>
+// CHECK: func.func @avg_pool()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.pooling_nhwc_sum
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @avg_pool {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 4>>
- }>) {
- hal.executable.export public @avg_pool layout(#pipeline_layout)
- builtin.module {
- func.func @avg_pool() {
- %cst = arith.constant 0.000000e+00 : f32
- %cst_0 = arith.constant 4.900000e+01 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 1280], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>> -> tensor<1x7x7x1280xf32>
- %3 = tensor.empty() : tensor<7x7xf32>
- %4 = tensor.empty() : tensor<1x1x1x1280xf32>
- %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
- %6 = linalg.pooling_nhwc_sum {
- dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>
- } ins(%2, %3 : tensor<1x7x7x1280xf32>, tensor<7x7xf32>) outs(%5 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]
- } ins(%6 : tensor<1x1x1x1280xf32>) outs(%4 : tensor<1x1x1x1280xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %8 = arith.divf %arg0, %cst_0 : f32
- linalg.yield %8 : f32
- } -> tensor<1x1x1x1280xf32>
- flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 1280], strides = [1, 1, 1, 1]
- : tensor<1x1x1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 4>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @avg_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 4.900000e+01 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 1280], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>> -> tensor<1x7x7x1280xf32>
+ %3 = tensor.empty() : tensor<7x7xf32>
+ %4 = tensor.empty() : tensor<1x1x1x1280xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
+ %6 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%2, %3 : tensor<1x7x7x1280xf32>, tensor<7x7xf32>) outs(%5 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x1x1x1280xf32>) outs(%4 : tensor<1x1x1x1280xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %8 = arith.divf %in, %cst_0 : f32
+ linalg.yield %8 : f32
+ } -> tensor<1x1x1x1280xf32>
+ flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 1280], strides = [1, 1, 1, 1] : tensor<1x1x1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 1, 128], [1, 1, 1, 4], [0, 0, 0, 0, 1, 1], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @avg_pool
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+// CHECK: func.func @avg_pool()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.pooling_nhwc_sum
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Max pooling op with odd size-1 dimension sizes.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @max_pool {
- hal.executable.variant @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @max_pool layout(#pipeline_layout)
- builtin.module {
- func.func @max_pool() {
- %cst = arith.constant 0xFF800000 : f32
- %c38 = arith.constant 38 : index
- %c1 = arith.constant 1 : index
- %c0 = arith.constant 0 : index
- %c320 = arith.constant 320 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
- %2 = tensor.empty() : tensor<2x1xf32>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 76, 1, 1], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>> -> tensor<1x76x1x1xf32>
- %18 = tensor.empty() : tensor<1x38x1x1xf32>
- %19 = linalg.fill ins(%cst : f32) outs(%18 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
- %20 = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 1]> : vector<2xi64>}
- ins(%13, %2 : tensor<1x76x1x1xf32>, tensor<2x1xf32>)
- outs(%19 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
- flow.dispatch.tensor.store %20, %1, offsets = [0, 0, 0, 0], sizes = [1, 38, 1, 1], strides = [1, 1, 1, 1]
- : tensor<1x38x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+module {
+ func.func @max_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0xFF800000 : f32
+ %c38 = arith.constant 38 : index
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c320 = arith.constant 320 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
+ %2 = tensor.empty() : tensor<2x1xf32>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 76, 1, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>> -> tensor<1x76x1x1xf32>
+ %4 = tensor.empty() : tensor<1x38x1x1xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
+ %6 = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 1]> : vector<2xi64>} ins(%3, %2 : tensor<1x76x1x1xf32>, tensor<2x1xf32>) outs(%5 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
+ flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 38, 1, 1], strides = [1, 1, 1, 1] : tensor<1x38x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 32], [0, 1]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @max_pool
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [32, 1, 1]>
+// CHECK: func.func @max_pool()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.pooling_nhwc_max
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -247,443 +147,279 @@
// Element wise op with mismatched input and output rank.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @elementwise {
- hal.executable.variant @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @elementwise layout(#pipeline_layout)
- builtin.module {
- func.func @elementwise() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c10 = arith.constant 10 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x10xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<10xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<10xf32>>
- %9 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 10], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1]
- : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
- %11 = tensor.empty() : tensor<10xf32>
- %12 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%9, %10 : tensor<1x10xf32>, tensor<10xf32>) outs(%11 : tensor<10xf32>) {
- ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
- %13 = arith.addf %arg2, %arg3 : f32
- linalg.yield %13 : f32
- } -> tensor<10xf32>
- flow.dispatch.tensor.store %12, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<writeonly:tensor<10xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d1)>
+module {
+ func.func @elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x10xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<10xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<10xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
+ %5 = tensor.empty() : tensor<10xf32>
+ %6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<1x10xf32>, tensor<10xf32>) outs(%5 : tensor<10xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %7 = arith.addf %in, %in_0 : f32
+ linalg.yield %7 : f32
+ } -> tensor<10xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<writeonly:tensor<10xf32>>
+ return
}
}
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @elementwise
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [32, 1, 1]>
+// CHECK: func.func @elementwise()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// -----
// Fused depthwise convolution and element wise ops: don't vectorize with partially active subgroups.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#map22 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-
-hal.executable @dwconv_elementwise {
- hal.executable.variant @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @dwconv_elementwise layout(#pipeline_layout)
- builtin.module {
- func.func @dwconv_elementwise() {
- %cst = arith.constant dense_resource<__elided__> : tensor<3x3x1x4xf32>
- %cst_8 = arith.constant 1.001000e+00 : f32
- %cst_9 = arith.constant 0.000000e+00 : f32
- %c18 = arith.constant 18 : index
- %c1 = arith.constant 1 : index
- %c4 = arith.constant 4 : index
- %c4576 = arith.constant 4576 : index
- %c6272 = arith.constant 6272 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x21x20x1xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x19x18x1x4xf32>>
- %11 = tensor.empty() : tensor<1x19x18x1x4xf32>
- %14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 21, 20, 1], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x21x20x1xf32>> -> tensor<1x21x20x1xf32>
- %18 = tensor.empty() : tensor<1x19x18x1x4xf32>
- %19 = linalg.fill ins(%cst_9 : f32) outs(%18 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
- %20 = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
- ins(%14, %cst : tensor<1x21x20x1xf32>, tensor<3x3x1x4xf32>) outs(%19 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
- %21 = linalg.generic {
- indexing_maps = [#map22, #map22], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
- ins(%20 : tensor<1x19x18x1x4xf32>) outs(%11 : tensor<1x19x18x1x4xf32>) {
- ^bb0(%arg3: f32, %arg4: f32):
- %22 = math.sqrt %cst_8 : f32
- %23 = arith.addf %arg3, %cst_9 : f32
- linalg.yield %23 : f32
- } -> tensor<1x19x18x1x4xf32>
- flow.dispatch.tensor.store %21, %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 19, 18, 1, 4], strides = [1, 1, 1, 1, 1]
- : tensor<1x19x18x1x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x19x18x1x4xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
+module {
+ func.func @dwconv_elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant dense_resource<__elided__> : tensor<3x3x1x4xf32>
+ %cst_0 = arith.constant 1.001000e+00 : f32
+ %cst_1 = arith.constant 0.000000e+00 : f32
+ %c18 = arith.constant 18 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %c4576 = arith.constant 4576 : index
+ %c6272 = arith.constant 6272 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x21x20x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x19x18x1x4xf32>>
+ %2 = tensor.empty() : tensor<1x19x18x1x4xf32>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 21, 20, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x21x20x1xf32>> -> tensor<1x21x20x1xf32>
+ %4 = tensor.empty() : tensor<1x19x18x1x4xf32>
+ %5 = linalg.fill ins(%cst_1 : f32) outs(%4 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
+ %6 = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %cst : tensor<1x21x20x1xf32>, tensor<3x3x1x4xf32>) outs(%5 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x19x18x1x4xf32>) outs(%2 : tensor<1x19x18x1x4xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %8 = math.sqrt %cst_0 : f32
+ %9 = arith.addf %in, %cst_1 : f32
+ linalg.yield %9 : f32
+ } -> tensor<1x19x18x1x4xf32>
+ flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 19, 18, 1, 4], strides = [1, 1, 1, 1, 1] : tensor<1x19x18x1x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x19x18x1x4xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 4, 2, 0, 4], [0, 1, 1, 0, 1]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @dwconv_elementwise
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [4, 2, 4]>
+// CHECK: func.func @dwconv_elementwise()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwcm
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#map0 = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+#map = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-
-hal.executable @outermost_reduction {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @outermost_reduction layout(#pipeline_layout)
- builtin.module {
- func.func @outermost_reduction() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x2048x512xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2048, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x2048x512xf32>> -> tensor<4x2048x512xf32>
- %3 = tensor.empty() : tensor<2048x512xf32>
- %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
- %5 = linalg.generic {
- indexing_maps = [#map0, #map1],
- iterator_types = ["parallel", "parallel", "reduction"]
- } ins(%2 : tensor<4x2048x512xf32>) outs(%4 : tensor<2048x512xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %6 = arith.addf %arg0, %arg1 : f32
- linalg.yield %6 : f32
- } -> tensor<2048x512xf32>
- flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
- return
- }
- }
+module {
+ func.func @outermost_reduction() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x2048x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2048, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x2048x512xf32>> -> tensor<4x2048x512xf32>
+ %3 = tensor.empty() : tensor<2048x512xf32>
+ %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
+ %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<4x2048x512xf32>) outs(%4 : tensor<2048x512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %out : f32
+ linalg.yield %6 : f32
+ } -> tensor<2048x512xf32>
+ flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 128], [1, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @outermost_reduction
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+// CHECK: func.func @outermost_reduction()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
+
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-#map0 = affine_map<(d0, d1) -> (d0, d1)>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
-
-hal.executable private @innermost_reduction {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @innermost_reduction ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @innermost_reduction() {
- %cst = arith.constant -0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 394752 : index, 984064 : index]} : i32 to index
- %4 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [0 : index, 196608 : index, 197120 : index]} : i32 to index
- %5 = arith.index_cast %2 {stream.alignment = 512 : index, stream.values = [512 : index, 197120 : index, 197632 : index]} : i32 to index
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
- %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
- %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
- %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
- %10 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
- %11 = tensor.empty() : tensor<128xf32>
- %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<128xf32>) -> tensor<128xf32>
- %13 = linalg.generic {
- indexing_maps = [#map0, #map1, #map1],
- iterator_types = ["parallel", "reduction"]
- } ins(%9, %10 : tensor<128x384xf32>, tensor<128xf32>) outs(%12 : tensor<128xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %14 = arith.subf %arg0, %arg1 : f32
- %15 = arith.mulf %14, %14 : f32
- %16 = arith.addf %15, %arg2 : f32
- linalg.yield %16 : f32
- } -> tensor<128xf32>
- flow.dispatch.tensor.store %13, %8, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
- return
- }
- }
+module {
+ func.func @innermost_reduction() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant -0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 394752 : index, 984064 : index]} : i32 to index
+ %4 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [0 : index, 196608 : index, 197120 : index]} : i32 to index
+ %5 = arith.index_cast %2 {stream.alignment = 512 : index, stream.values = [512 : index, 197120 : index, 197632 : index]} : i32 to index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
+ %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
+ %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+ %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
+ %10 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
+ %11 = tensor.empty() : tensor<128xf32>
+ %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<128xf32>) -> tensor<128xf32>
+ %13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%9, %10 : tensor<128x384xf32>, tensor<128xf32>) outs(%12 : tensor<128xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %14 = arith.subf %in, %in_0 : f32
+ %15 = arith.mulf %14, %14 : f32
+ %16 = arith.addf %15, %out : f32
+ linalg.yield %16 : f32
+ } -> tensor<128xf32>
+ flow.dispatch.tensor.store %13, %8, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32], [1], [0, 4]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @innermost_reduction
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+// CHECK: func.func @innermost_reduction()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @four_dim_elementwise {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @four_dim_elementwise ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @four_dim_elementwise() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x8x256x4xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x256x4x8xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 8, 256, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x8x256x4xf32>> -> tensor<128x8x256x4xf32>
- %3 = tensor.empty() : tensor<128x256x4x8xf32>
- %4 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]
- } ins(%2 : tensor<128x8x256x4xf32>) outs(%3 : tensor<128x256x4x8xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- linalg.yield %arg0 : f32
- } -> tensor<128x256x4x8xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [128, 256, 4, 8], strides = [1, 1, 1, 1] : tensor<128x256x4x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x256x4x8xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @four_dim_elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x8x256x4xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x256x4x8xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 8, 256, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x8x256x4xf32>> -> tensor<128x8x256x4xf32>
+ %3 = tensor.empty() : tensor<128x256x4x8xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<128x8x256x4xf32>) outs(%3 : tensor<128x256x4x8xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<128x256x4x8xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [128, 256, 4, 8], strides = [1, 1, 1, 1] : tensor<128x256x4x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x256x4x8xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 2, 4, 8], [0, 1, 1, 4]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @four_dim_elementwise
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 4, 2]>
+// CHECK: func.func @four_dim_elementwise()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-
-hal.executable private @odd_reduction_dimension_size_501 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @odd_reduction_dimension_size_501 ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @odd_reduction_dimension_size_501() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0xFF800000 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x501xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x501xf32>> -> tensor<512x501xf32>
- %3 = tensor.empty() : tensor<512x501xf32>
- %4 = tensor.empty() : tensor<512xf32>
- %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32>
- %6 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
- iterator_types = ["parallel", "reduction"]
- } ins(%2 : tensor<512x501xf32>) outs(%5 : tensor<512xf32>) {
- ^bb0(%in: f32, %out: f32):
- %8 = arith.maximumf %out, %in : f32
- linalg.yield %8 : f32
- } -> tensor<512xf32>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- } ins(%2, %6 : tensor<512x501xf32>, tensor<512xf32>) outs(%3 : tensor<512x501xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %8 = arith.subf %in, %in_0 : f32
- %9 = math.exp %8 : f32
- linalg.yield %9 : f32
- } -> tensor<512x501xf32>
- flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : tensor<512x501xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @odd_reduction_dimension_size_501() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0xFF800000 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x501xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x501xf32>> -> tensor<512x501xf32>
+ %3 = tensor.empty() : tensor<512x501xf32>
+ %4 = tensor.empty() : tensor<512xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32>
+ %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x501xf32>) outs(%5 : tensor<512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %8 = arith.maximumf %out, %in : f32
+ linalg.yield %8 : f32
+ } -> tensor<512xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<512x501xf32>, tensor<512xf32>) outs(%3 : tensor<512x501xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.subf %in, %in_0 : f32
+ %9 = math.exp %8 : f32
+ linalg.yield %9 : f32
+ } -> tensor<512x501xf32>
+ flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : tensor<512x501xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x501xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128], [4], [0, 3]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @odd_reduction_dimension_size_501
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+// CHECK: func.func @odd_reduction_dimension_size_501()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-
-hal.executable private @odd_reduction_dimension_size_2809 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @odd_reduction_dimension_size_2809 ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @odd_reduction_dimension_size_2809() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0xFF800000 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x2809xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<512x2809xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2809xf32>> -> tensor<512x2809xf32>
- %3 = tensor.empty() : tensor<512x2809xf32>
- %4 = tensor.empty() : tensor<512xf32>
- %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32>
- %6 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
- iterator_types = ["parallel", "reduction"]
- } ins(%2 : tensor<512x2809xf32>) outs(%5 : tensor<512xf32>) {
- ^bb0(%in: f32, %out: f32):
- %8 = arith.maximumf %out, %in : f32
- linalg.yield %8 : f32
- } -> tensor<512xf32>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- } ins(%2, %6 : tensor<512x2809xf32>, tensor<512xf32>) outs(%3 : tensor<512x2809xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %8 = arith.subf %in, %in_0 : f32
- %9 = math.exp %8 : f32
- linalg.yield %9 : f32
- } -> tensor<512x2809xf32>
- flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : tensor<512x2809xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x2809xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @odd_reduction_dimension_size_2809() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0xFF800000 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x2809xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<512x2809xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2809xf32>> -> tensor<512x2809xf32>
+ %3 = tensor.empty() : tensor<512x2809xf32>
+ %4 = tensor.empty() : tensor<512xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32>
+ %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x2809xf32>) outs(%5 : tensor<512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %8 = arith.maximumf %out, %in : f32
+ linalg.yield %8 : f32
+ } -> tensor<512xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<512x2809xf32>, tensor<512xf32>) outs(%3 : tensor<512x2809xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.subf %in, %in_0 : f32
+ %9 = math.exp %8 : f32
+ linalg.yield %9 : f32
+ } -> tensor<512x2809xf32>
+ flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : tensor<512x2809xf32> -> !flow.dispatch.tensor<writeonly:tensor<512x2809xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128], [4], [0, 1]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @odd_reduction_dimension_size_2809
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+// CHECK: func.func @odd_reduction_dimension_size_2809()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-
-hal.executable private @broadcast {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @broadcast ordinal(0) layout(#pipeline_layout)
- builtin.module {
- func.func @broadcast() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 1.000000e-10 : f32
- %cst_0 = arith.constant -1.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<f32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x1x1x1xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
- %3 = tensor.empty() : tensor<2048x1x1x1xf32>
- %4 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel"]
- } ins(%2 : tensor<f32>) outs(%3 : tensor<2048x1x1x1xf32>) {
- ^bb0(%in: f32, %out: f32):
- %5 = arith.maximumf %in, %cst : f32
- %6 = arith.divf %cst_0, %5 : f32
- linalg.yield %6 : f32
- } -> tensor<2048x1x1x1xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2048, 1, 1, 1], strides = [1, 1, 1, 1] : tensor<2048x1x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x1x1x1xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512]>>}>
+#map = affine_map<(d0, d1, d2, d3) -> ()>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+ func.func @broadcast() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 1.000000e-10 : f32
+ %cst_0 = arith.constant -1.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<f32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x1x1x1xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+ %3 = tensor.empty() : tensor<2048x1x1x1xf32>
+ %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<f32>) outs(%3 : tensor<2048x1x1x1xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %5 = arith.maximumf %in, %cst : f32
+ %6 = arith.divf %cst_0, %5 : f32
+ linalg.yield %6 : f32
+ } -> tensor<2048x1x1x1xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2048, 1, 1, 1], strides = [1, 1, 1, 1] : tensor<2048x1x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x1x1x1xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128], [4], [0, 1, 1, 1]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @broadcast
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+// CHECK: func.func @broadcast()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
index e6f33c0..a3c7c56 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
@@ -1,369 +1,220 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
// Odd K that forbids vectorization.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_1x3x32 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @batch_matmul_1x3x32 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_1x3x32() {
- %c0 = arith.constant 0 : index
- %c32 = arith.constant 32 : index
- %c3 = arith.constant 3 : index
- %c1 = arith.constant 1 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 3, 3], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>> -> tensor<1x3x3xf32>
- %14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>> -> tensor<1x3x32xf32>
- %21 = tensor.empty() : tensor<1x3x32xf32>
- %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
- %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%11, %14 : tensor<1x3x3xf32>, tensor<1x3x32xf32>) outs(%22 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
- flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1]
- : tensor<1x3x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+module {
+ func.func @batch_matmul_1x3x32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %c3 = arith.constant 3 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 3, 3], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>> -> tensor<1x3x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>> -> tensor<1x3x32xf32>
+ %5 = tensor.empty() : tensor<1x3x32xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
+ %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<1x3x3xf32>, tensor<1x3x32xf32>) outs(%6 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1] : tensor<1x3x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 32], [0, 1, 1]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK-LABEL: hal.executable.export public @batch_matmul_1x3x32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [32, 1, 1]>
// CHECK: func.func @batch_matmul_1x3x32()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.batch_matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// 8-bit integers can be vectorized.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_64x16xi8 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @matmul_64x16xi8 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_64x16xi8() {
- %c0 = arith.constant 0 : index
- %c16 = arith.constant 16 : index
- %c64 = arith.constant 64 : index
- %c0_i32 = arith.constant 0 : i32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xi8>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x16xi8>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<64x32xi8>> -> tensor<64x32xi8>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<32x16xi8>> -> tensor<32x16xi8>
- %15 = tensor.empty() : tensor<64x16xi32>
- %16 = linalg.fill ins(%c0_i32 : i32) outs(%15 : tensor<64x16xi32>) -> tensor<64x16xi32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%8, %10 : tensor<64x32xi8>, tensor<32x16xi8>) outs(%16 : tensor<64x16xi32>) -> tensor<64x16xi32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1]
- : tensor<64x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64>>}>
+module {
+ func.func @matmul_64x16xi8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c64 = arith.constant 64 : index
+ %c0_i32 = arith.constant 0 : i32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xi8>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x16xi8>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xi8>> -> tensor<64x32xi8>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x16xi8>> -> tensor<32x16xi8>
+ %5 = tensor.empty() : tensor<64x16xi32>
+ %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<64x16xi32>) -> tensor<64x16xi32>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<64x32xi8>, tensor<32x16xi8>) outs(%6 : tensor<64x16xi32>) -> tensor<64x16xi32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1] : tensor<64x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 16], [2, 8], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @matmul_64x16xi8
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 32 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 32, 1]>
// CHECK: func.func @matmul_64x16xi8()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Vectorize non-32 bit types.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_64x16xi64 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Int64], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @matmul_64x16xi64 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_64x16xi64() {
- %c0 = arith.constant 0 : index
- %c16 = arith.constant 16 : index
- %c64 = arith.constant 64 : index
- %c0_i32 = arith.constant 0 : i32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xi64>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x16xi64>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<64x32xi64>> -> tensor<64x32xi64>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<32x16xi64>> -> tensor<32x16xi64>
- %15 = tensor.empty() : tensor<64x16xi64>
- %16 = linalg.fill ins(%c0_i32 : i32) outs(%15 : tensor<64x16xi64>) -> tensor<64x16xi64>
- %17 = linalg.matmul
- ins(%8, %10 : tensor<64x32xi64>, tensor<32x16xi64>) outs(%16 : tensor<64x16xi64>) -> tensor<64x16xi64>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1]
- : tensor<64x16xi64> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Int64], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64>>}>
+module {
+ func.func @matmul_64x16xi64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c64 = arith.constant 64 : index
+ %c0_i32 = arith.constant 0 : i32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xi64>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x16xi64>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xi64>> -> tensor<64x32xi64>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x16xi64>> -> tensor<32x16xi64>
+ %5 = tensor.empty() : tensor<64x16xi64>
+ %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<64x16xi64>) -> tensor<64x16xi64>
+ %7 = linalg.matmul ins(%3, %4 : tensor<64x32xi64>, tensor<32x16xi64>) outs(%6 : tensor<64x16xi64>) -> tensor<64x16xi64>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1] : tensor<64x16xi64> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 16], [1, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @matmul_64x16xi64
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [4 : index, 16 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [4, 16, 1]>
// CHECK: func.func @matmul_64x16xi64()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Odd N that forbids vectorization.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_400x273 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @matmul_400x273 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_400x273() {
- %c0 = arith.constant 0 : index
- %c11775744 = arith.constant 11775744 : index
- %cst = arith.constant 0.000000e+00 : f32
- %c400 = arith.constant 400 : index
- %c273 = arith.constant 273 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c11775744) : !flow.dispatch.tensor<readonly:tensor<273xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<400x576xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<576x273xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<400x273xf32>>
- %9 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [273], strides = [1] : !flow.dispatch.tensor<readonly:tensor<273xf32>> -> tensor<273xf32>
- %11 = tensor.empty() : tensor<400x273xf32>
- %13 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [400, 576], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<400x576xf32>> -> tensor<400x576xf32>
- %15 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [576, 273], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<576x273xf32>> -> tensor<576x273xf32>
- %16 = tensor.empty() : tensor<400x273xf32>
- %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<400x273xf32>) -> tensor<400x273xf32>
- %18 = linalg.matmul ins(%13, %15 : tensor<400x576xf32>, tensor<576x273xf32>) outs(%17 : tensor<400x273xf32>) -> tensor<400x273xf32>
- %19 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%9, %18 : tensor<273xf32>, tensor<400x273xf32>) outs(%11 : tensor<400x273xf32>) {
- ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
- %20 = arith.addf %arg2, %arg3 : f32
- linalg.yield %20 : f32
- } -> tensor<400x273xf32>
- flow.dispatch.tensor.store %19, %3, offsets = [0, 0], sizes = [400, 273], strides = [1, 1]
- : tensor<400x273xf32> -> !flow.dispatch.tensor<writeonly:tensor<400x273xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @matmul_400x273() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c11775744 = arith.constant 11775744 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c400 = arith.constant 400 : index
+ %c273 = arith.constant 273 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c11775744) : !flow.dispatch.tensor<readonly:tensor<273xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<400x576xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<576x273xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<400x273xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [273], strides = [1] : !flow.dispatch.tensor<readonly:tensor<273xf32>> -> tensor<273xf32>
+ %5 = tensor.empty() : tensor<400x273xf32>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [400, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<400x576xf32>> -> tensor<400x576xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [576, 273], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x273xf32>> -> tensor<576x273xf32>
+ %8 = tensor.empty() : tensor<400x273xf32>
+ %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<400x273xf32>) -> tensor<400x273xf32>
+ %10 = linalg.matmul ins(%6, %7 : tensor<400x576xf32>, tensor<576x273xf32>) outs(%9 : tensor<400x273xf32>) -> tensor<400x273xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %10 : tensor<273xf32>, tensor<400x273xf32>) outs(%5 : tensor<400x273xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %12 = arith.addf %in, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<400x273xf32>
+ flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [400, 273], strides = [1, 1] : tensor<400x273xf32> -> !flow.dispatch.tensor<writeonly:tensor<400x273xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[2, 32], [1, 1]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK-LABEL: hal.executable.export public @matmul_400x273
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [32, 2, 1]>
// CHECK: func.func @matmul_400x273()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Odd M and non-4-multiplier N
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_25x546 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @matmul_25x546 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_25x546() {
- %c0 = arith.constant 0 : index
- %c15842560 = arith.constant 15842560 : index
- %cst = arith.constant 0.000000e+00 : f32
- %c25 = arith.constant 25 : index
- %c546 = arith.constant 546 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c15842560) : !flow.dispatch.tensor<readonly:tensor<546xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<25x512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x546xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<25x546xf32>>
- %9 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [546], strides = [1]
- : !flow.dispatch.tensor<readonly:tensor<546xf32>> -> tensor<546xf32>
- %11 = tensor.empty() : tensor<25x546xf32>
- %13 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [25, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<25x512xf32>> -> tensor<25x512xf32>
- %15 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 546], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<512x546xf32>> -> tensor<512x546xf32>
- %16 = tensor.empty() : tensor<25x546xf32>
- %17 = linalg.fill ins(%cst : f32) outs(%16 : tensor<25x546xf32>) -> tensor<25x546xf32>
- %18 = linalg.matmul ins(%13, %15 : tensor<25x512xf32>, tensor<512x546xf32>) outs(%17 : tensor<25x546xf32>) -> tensor<25x546xf32>
- %19 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%9, %18 : tensor<546xf32>, tensor<25x546xf32>) outs(%11 : tensor<25x546xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
- ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
- %20 = arith.addf %arg2, %arg3 : f32
- linalg.yield %20 : f32
- } -> tensor<25x546xf32>
- flow.dispatch.tensor.store %19, %3, offsets = [0, 0], sizes = [25, 546], strides = [1, 1]
- : tensor<25x546xf32> -> !flow.dispatch.tensor<writeonly:tensor<25x546xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @matmul_25x546() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c15842560 = arith.constant 15842560 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c25 = arith.constant 25 : index
+ %c546 = arith.constant 546 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c15842560) : !flow.dispatch.tensor<readonly:tensor<546xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<25x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x546xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<25x546xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [546], strides = [1] : !flow.dispatch.tensor<readonly:tensor<546xf32>> -> tensor<546xf32>
+ %5 = tensor.empty() : tensor<25x546xf32>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [25, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<25x512xf32>> -> tensor<25x512xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 546], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x546xf32>> -> tensor<512x546xf32>
+ %8 = tensor.empty() : tensor<25x546xf32>
+ %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<25x546xf32>) -> tensor<25x546xf32>
+ %10 = linalg.matmul ins(%6, %7 : tensor<25x512xf32>, tensor<512x546xf32>) outs(%9 : tensor<25x546xf32>) -> tensor<25x546xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %10 : tensor<546xf32>, tensor<25x546xf32>) outs(%5 : tensor<25x546xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %12 = arith.addf %in, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<25x546xf32>
+ flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [25, 546], strides = [1, 1] : tensor<25x546xf32> -> !flow.dispatch.tensor<writeonly:tensor<25x546xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 2], [1, 1]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK-LABEL: hal.executable.export public @matmul_25x546
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 32 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [2, 32, 1]>
// CHECK: func.func @matmul_25x546()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Matmul with consumer pointwise ops
-#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
-#map1 = affine_map<(d0)[s0] -> (s0, -d0 + 256)>
-#map2 = affine_map<(d0)[s0] -> (s0, -d0 + 1024)>
-#map3 = affine_map<(d0)[s0] -> (-d0 + 256, s0)>
-#map4 = affine_map<(d0)[s0] -> (-d0 + 1024, s0)>
-#map5 = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_pointwise_256x1024 {
- hal.executable.variant public @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @matmul_pointwise_256x1024 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_pointwise_256x1024() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %c256 = arith.constant 256 : index
- %c1024 = arith.constant 1024 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x128xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
- %12 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
- %13 = tensor.empty() : tensor<256x1024xf16>
- %15 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x128xf16>> -> tensor<256x128xf16>
- %17 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>> -> tensor<128x1024xf16>
- %18 = tensor.empty() : tensor<256x1024xf16>
- %19 = linalg.fill ins(%cst : f16) outs(%18 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %20 = linalg.matmul ins(%15, %17 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%19 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %21 = linalg.generic {
- indexing_maps = [#map5, #map5, #map5, #map5], iterator_types = ["parallel", "parallel"]}
- ins(%20, %11, %12 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%13 : tensor<256x1024xf16>) {
- ^bb0(%arg2: f16, %arg3: f16, %arg4: f16, %arg5: f16): // no predecessors
- %22 = arith.divf %arg2, %arg3 : f16
- %23 = arith.subf %22, %arg4 : f16
- linalg.yield %23 : f16
- } -> tensor<256x1024xf16>
- flow.dispatch.tensor.store %21, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @matmul_pointwise_256x1024() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %c256 = arith.constant 256 : index
+ %c1024 = arith.constant 1024 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x128xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
+ %7 = tensor.empty() : tensor<256x1024xf16>
+ %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xf16>> -> tensor<256x128xf16>
+ %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>> -> tensor<128x1024xf16>
+ %10 = tensor.empty() : tensor<256x1024xf16>
+ %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16):
+ %14 = arith.divf %in, %in_0 : f16
+ %15 = arith.subf %14, %in_1 : f16
+ linalg.yield %15 : f16
+ } -> tensor<256x1024xf16>
+ flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 256], [8, 8], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @matmul_pointwise_256x1024
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 2, 1]>
// CHECK: func.func @matmul_pointwise_256x1024()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matvec.mlir
index 6a17766..ba468be 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matvec.mlir
@@ -1,610 +1,415 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-
-hal.executable @i4_dequant_matvec_f32 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @i4_dequant_matvec_f32 layout(#pipeline_layout)
- builtin.module {
- func.func @i4_dequant_matvec_f32() {
- %cst = arith.constant 0.000000e+00 : f32
- %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
- %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
- %14 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
- %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
- %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %18 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
- %19 = tensor.empty() : tensor<4096xf32>
- %20 = tensor.empty() : tensor<4096x86x128xf32>
- %21 = linalg.fill ins(%cst : f32) outs(%19 : tensor<4096xf32>) -> tensor<4096xf32>
- %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %16, %17 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%20 : tensor<4096x86x128xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %24 = arith.extui %in : i4 to i32
- %25 = arith.uitofp %24 : i32 to f32
- %26 = arith.subf %25, %in_1 : f32
- %27 = arith.mulf %26, %in_0 : f32
- linalg.yield %27 : f32
- } -> tensor<4096x86x128xf32>
- %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%18, %22 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%21 : tensor<4096xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %24 = arith.mulf %in, %in_0 : f32
- %25 = arith.addf %24, %out : f32
- linalg.yield %25 : f32
- } -> tensor<4096xf32>
- flow.dispatch.tensor.store %23, %14, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0)>
+module {
+ func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
+ %9 = tensor.empty() : tensor<4096xf32>
+ %10 = tensor.empty() : tensor<4096x86x128xf32>
+ %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
+ %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096x86x128xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %14 = arith.extui %in : i4 to i32
+ %15 = arith.uitofp %14 : i32 to f32
+ %16 = arith.subf %15, %in_1 : f32
+ %17 = arith.mulf %16, %in_0 : f32
+ linalg.yield %17 : f32
+ } -> tensor<4096x86x128xf32>
+ %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%11 : tensor<4096xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %14 = arith.mulf %in, %in_0 : f32
+ %15 = arith.addf %14, %out : f32
+ linalg.yield %15 : f32
+ } -> tensor<4096xf32>
+ flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1], [0, 2, 128]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec_f32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [64, 1, 1]>
// CHECK: func.func @i4_dequant_matvec_f32()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-
-hal.executable @i4_dequant_matvec_f32 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @i4_dequant_matvec_f32 layout(#pipeline_layout)
- builtin.module {
- func.func @i4_dequant_matvec_f32() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f32
- %c4294967296_i64 = arith.constant 4294967296 : i64
- %22 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>>
- %23 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>>
- %24 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>>
- %25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x1x32x128xf32>>
- %26 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf32>>
- %27 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>> -> tensor<4096x32x128xi4>
- %28 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>> -> tensor<4096x32x1xf32>
- %29 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>> -> tensor<4096x32x1xf32>
- %30 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [1, 1, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x32x128xf32>> -> tensor<1x1x32x128xf32>
- %31 = tensor.empty() : tensor<1x1x4096xf32>
- %32 = tensor.empty() : tensor<4096x32x128xf32>
- %33 = linalg.fill ins(%cst : f32) outs(%31 : tensor<1x1x4096xf32>) -> tensor<1x1x4096xf32>
- %34 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1, 0)>,
- affine_map<(d0, d1, d2) -> (d0, d1, 0)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%27, %28, %29 : tensor<4096x32x128xi4>, tensor<4096x32x1xf32>, tensor<4096x32x1xf32>) outs(%32 : tensor<4096x32x128xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %36 = arith.extui %in : i4 to i32
- %37 = arith.uitofp %36 : i32 to f32
- %38 = arith.subf %37, %in_1 : f32
- %39 = arith.mulf %38, %in_0 : f32
- linalg.yield %39 : f32
- } -> tensor<4096x32x128xf32>
- %35 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]}
- ins(%30, %34 : tensor<1x1x32x128xf32>, tensor<4096x32x128xf32>) outs(%33 : tensor<1x1x4096xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %36 = arith.mulf %in, %in_0 : f32
- %37 = arith.addf %36, %out : f32
- linalg.yield %37 : f32
- } -> tensor<1x1x4096xf32>
- flow.dispatch.tensor.store %35, %26, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, 0)>
+#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>
+#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>
+#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
+module {
+ func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f32
+ %c4294967296_i64 = arith.constant 4294967296 : i64
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x1x32x128xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf32>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>> -> tensor<4096x32x128xi4>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>> -> tensor<4096x32x1xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>> -> tensor<4096x32x1xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 1, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x32x128xf32>> -> tensor<1x1x32x128xf32>
+ %9 = tensor.empty() : tensor<1x1x4096xf32>
+ %10 = tensor.empty() : tensor<4096x32x128xf32>
+ %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x1x4096xf32>) -> tensor<1x1x4096xf32>
+ %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32x1xf32>, tensor<4096x32x1xf32>) outs(%10 : tensor<4096x32x128xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %14 = arith.extui %in : i4 to i32
+ %15 = arith.uitofp %14 : i32 to f32
+ %16 = arith.subf %15, %in_1 : f32
+ %17 = arith.mulf %16, %in_0 : f32
+ linalg.yield %17 : f32
+ } -> tensor<4096x32x128xf32>
+ %13 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<1x1x32x128xf32>, tensor<4096x32x128xf32>) outs(%11 : tensor<1x1x4096xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %14 = arith.mulf %in, %in_0 : f32
+ %15 = arith.addf %14, %out : f32
+ linalg.yield %15 : f32
+ } -> tensor<1x1x4096xf32>
+ flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 1], [0, 0, 0, 4, 128]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec_f32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [128 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [128, 1, 1]>
// CHECK: func.func @i4_dequant_matvec_f32()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-
-hal.executable @i4_dequant_matvec_f32 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @i4_dequant_matvec_f32 layout(#pipeline_layout)
- builtin.module {
- func.func @i4_dequant_matvec_f32() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = hal.interface.constant.load[5] : i32
- %6 = hal.interface.constant.load[6] : i32
- %7 = hal.interface.constant.load[7] : i32
- %8 = hal.interface.constant.load[8] : i32
- %9 = arith.index_castui %0 : i32 to index
- %10 = arith.index_castui %1 : i32 to index
- %11 = arith.index_castui %2 : i32 to index
- %12 = arith.extui %3 : i32 to i64
- %13 = arith.extui %4 : i32 to i64
- %14 = arith.shli %13, %c32_i64 : i64
- %15 = arith.ori %12, %14 : i64
- %16 = arith.index_castui %15 : i64 to index
- %17 = arith.extui %5 : i32 to i64
- %18 = arith.extui %6 : i32 to i64
- %19 = arith.shli %18, %c32_i64 : i64
- %20 = arith.ori %17, %19 : i64
- %21 = arith.index_castui %20 : i64 to index
- %22 = arith.extui %7 : i32 to i64
- %23 = arith.extui %8 : i32 to i64
- %24 = arith.shli %23, %c32_i64 : i64
- %25 = arith.ori %22, %24 : i64
- %26 = arith.index_castui %25 : i64 to index
- %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
- %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %30 = flow.dispatch.workload.ordinal %26, 0 : index
- %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30}
- %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
- %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
- %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30} -> tensor<?x86x128xf32>
- %37 = tensor.empty(%30) : tensor<?x4096xf32>
- %38 = tensor.empty() : tensor<4096x86x128xf32>
- %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
- %40 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %42 = arith.extui %in : i4 to i32
- %43 = arith.uitofp %42 : i32 to f32
- %44 = arith.subf %43, %in_1 : f32
- %45 = arith.mulf %44, %in_0 : f32
- linalg.yield %45 : f32
- } -> tensor<4096x86x128xf32>
- %41 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
- ins(%36, %40 : tensor<?x86x128xf32>, tensor<4096x86x128xf32>) outs(%39 : tensor<?x4096xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %42 = arith.mulf %in, %in_0 : f32
- %43 = arith.addf %42, %out : f32
- linalg.yield %43 : f32
- } -> tensor<?x4096xf32>
- flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+module {
+ func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = hal.interface.constant.load[5] : i32
+ %6 = hal.interface.constant.load[6] : i32
+ %7 = hal.interface.constant.load[7] : i32
+ %8 = hal.interface.constant.load[8] : i32
+ %9 = arith.index_castui %0 : i32 to index
+ %10 = arith.index_castui %1 : i32 to index
+ %11 = arith.index_castui %2 : i32 to index
+ %12 = arith.extui %3 : i32 to i64
+ %13 = arith.extui %4 : i32 to i64
+ %14 = arith.shli %13, %c32_i64 : i64
+ %15 = arith.ori %12, %14 : i64
+ %16 = arith.index_castui %15 : i64 to index
+ %17 = arith.extui %5 : i32 to i64
+ %18 = arith.extui %6 : i32 to i64
+ %19 = arith.shli %18, %c32_i64 : i64
+ %20 = arith.ori %17, %19 : i64
+ %21 = arith.index_castui %20 : i64 to index
+ %22 = arith.extui %7 : i32 to i64
+ %23 = arith.extui %8 : i32 to i64
+ %24 = arith.shli %23, %c32_i64 : i64
+ %25 = arith.ori %22, %24 : i64
+ %26 = arith.index_castui %25 : i64 to index
+ %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+ %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %30 = flow.dispatch.workload.ordinal %26, 0 : index
+ %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30}
+ %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+ %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+ %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30} -> tensor<?x86x128xf32>
+ %37 = tensor.empty(%30) : tensor<?x4096xf32>
+ %38 = tensor.empty() : tensor<4096x86x128xf32>
+ %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+ %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %42 = arith.extui %in : i4 to i32
+ %43 = arith.uitofp %42 : i32 to f32
+ %44 = arith.subf %43, %in_1 : f32
+ %45 = arith.mulf %44, %in_0 : f32
+ linalg.yield %45 : f32
+ } -> tensor<4096x86x128xf32>
+ %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%36, %40 : tensor<?x86x128xf32>, tensor<4096x86x128xf32>) outs(%39 : tensor<?x4096xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %42 = arith.mulf %in, %in_0 : f32
+ %43 = arith.addf %42, %out : f32
+ linalg.yield %43 : f32
+ } -> tensor<?x4096xf32>
+ flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 2, 128]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec_f32
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [64, 1, 1]>
// CHECK: func.func @i4_dequant_matvec_f32()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable @i4_dequant_matvec_f16 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.4, [Shader, Float16, StorageBuffer16BitAccess, GroupNonUniform, GroupNonUniformShuffle], [SPV_KHR_16bit_storage]>,
- Unknown:IntegratedGPU,
- #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @i4_dequant_matvec_f16 layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @i4_dequant_matvec_f16() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x86x128xf16>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf16>>
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
- %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>> -> tensor<4096x86x1xf16>
- %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>> -> tensor<4096x86x1xf16>
- %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 1, 86, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x86x128xf16>> -> tensor<1x1x86x128xf16>
- %9 = tensor.empty() : tensor<1x1x4096xf16>
- %10 = tensor.empty() : tensor<4096x86x128xf16>
- %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x1x4096xf16>) -> tensor<1x1x4096xf16>
- %12 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1, 0)>,
- affine_map<(d0, d1, d2) -> (d0, d1, 0)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86x1xf16>, tensor<4096x86x1xf16>) outs(%10 : tensor<4096x86x128xf16>) {
- ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
- %14 = arith.extui %in : i4 to i32
- %15 = arith.uitofp %14 : i32 to f16
- %16 = arith.subf %15, %in_1 : f16
- %17 = arith.mulf %16, %in_0 : f16
- linalg.yield %17 : f16
- } -> tensor<4096x86x128xf16>
- %13 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]}
- ins(%8, %12 : tensor<1x1x86x128xf16>, tensor<4096x86x128xf16>) outs(%11 : tensor<1x1x4096xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %14 = arith.mulf %in, %in_0 : f16
- %15 = arith.addf %14, %out : f16
- linalg.yield %15 : f16
- } -> tensor<1x1x4096xf16>
- flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, Float16, StorageBuffer16BitAccess, GroupNonUniform, GroupNonUniformShuffle], [SPV_KHR_16bit_storage]>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64]>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, 0)>
+#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>
+#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>
+#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
+module {
+ func.func @i4_dequant_matvec_f16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x86x128xf16>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf16>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>> -> tensor<4096x86x1xf16>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x1xf16>> -> tensor<4096x86x1xf16>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 1, 86, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x86x128xf16>> -> tensor<1x1x86x128xf16>
+ %9 = tensor.empty() : tensor<1x1x4096xf16>
+ %10 = tensor.empty() : tensor<4096x86x128xf16>
+ %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x1x4096xf16>) -> tensor<1x1x4096xf16>
+ %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86x1xf16>, tensor<4096x86x1xf16>) outs(%10 : tensor<4096x86x128xf16>) {
+ ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
+ %14 = arith.extui %in : i4 to i32
+ %15 = arith.uitofp %14 : i32 to f16
+ %16 = arith.subf %15, %in_1 : f16
+ %17 = arith.mulf %16, %in_0 : f16
+ linalg.yield %17 : f16
+ } -> tensor<4096x86x128xf16>
+ %13 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<1x1x86x128xf16>, tensor<4096x86x128xf16>) outs(%11 : tensor<1x1x4096xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %14 = arith.mulf %in, %in_0 : f16
+ %15 = arith.addf %14, %out : f16
+ linalg.yield %15 : f16
+ } -> tensor<1x1x4096xf16>
+ flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 1], [0, 0, 0, 2, 128]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec_f16
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [32, 1, 1]>
// CHECK: func.func @i4_dequant_matvec_f16()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-
-hal.executable @i4_dequant_matvec {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @i4_dequant_matvec layout(#pipeline_layout)
- builtin.module {
- func.func @i4_dequant_matvec() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = hal.interface.constant.load[5] : i32
- %6 = hal.interface.constant.load[6] : i32
- %7 = hal.interface.constant.load[7] : i32
- %8 = hal.interface.constant.load[8] : i32
- %9 = arith.index_castui %0 : i32 to index
- %10 = arith.index_castui %1 : i32 to index
- %11 = arith.index_castui %2 : i32 to index
- %12 = arith.extui %3 : i32 to i64
- %13 = arith.extui %4 : i32 to i64
- %14 = arith.shli %13, %c32_i64 : i64
- %15 = arith.ori %12, %14 : i64
- %16 = arith.index_castui %15 : i64 to index
- %17 = arith.extui %5 : i32 to i64
- %18 = arith.extui %6 : i32 to i64
- %19 = arith.shli %18, %c32_i64 : i64
- %20 = arith.ori %17, %19 : i64
- %21 = arith.index_castui %20 : i64 to index
- %22 = arith.extui %7 : i32 to i64
- %23 = arith.extui %8 : i32 to i64
- %24 = arith.shli %23, %c32_i64 : i64
- %25 = arith.ori %22, %24 : i64
- %26 = arith.index_castui %25 : i64 to index
- %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
- %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %30 = flow.dispatch.workload.ordinal %26, 0 : index
- %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30}
- %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
- %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
- %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30} -> tensor<?x86x128xf32>
- %37 = tensor.empty(%30) : tensor<?x4096xf32>
- %38 = tensor.empty() : tensor<4096x86x128xf32>
- %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
- %40 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %42 = arith.extui %in : i4 to i32
- %43 = arith.uitofp %42 : i32 to f32
- %44 = arith.subf %43, %in_1 : f32
- %45 = arith.mulf %44, %in_0 : f32
- linalg.yield %45 : f32
- } -> tensor<4096x86x128xf32>
- %41 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
- ins(%36, %40 : tensor<?x86x128xf32>, tensor<4096x86x128xf32>) outs(%39 : tensor<?x4096xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %42 = arith.mulf %in, %in_0 : f32
- %43 = arith.addf %42, %out : f32
- linalg.yield %43 : f32
- } -> tensor<?x4096xf32>
- flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+module {
+ func.func @i4_dequant_matvec() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = hal.interface.constant.load[5] : i32
+ %6 = hal.interface.constant.load[6] : i32
+ %7 = hal.interface.constant.load[7] : i32
+ %8 = hal.interface.constant.load[8] : i32
+ %9 = arith.index_castui %0 : i32 to index
+ %10 = arith.index_castui %1 : i32 to index
+ %11 = arith.index_castui %2 : i32 to index
+ %12 = arith.extui %3 : i32 to i64
+ %13 = arith.extui %4 : i32 to i64
+ %14 = arith.shli %13, %c32_i64 : i64
+ %15 = arith.ori %12, %14 : i64
+ %16 = arith.index_castui %15 : i64 to index
+ %17 = arith.extui %5 : i32 to i64
+ %18 = arith.extui %6 : i32 to i64
+ %19 = arith.shli %18, %c32_i64 : i64
+ %20 = arith.ori %17, %19 : i64
+ %21 = arith.index_castui %20 : i64 to index
+ %22 = arith.extui %7 : i32 to i64
+ %23 = arith.extui %8 : i32 to i64
+ %24 = arith.shli %23, %c32_i64 : i64
+ %25 = arith.ori %22, %24 : i64
+ %26 = arith.index_castui %25 : i64 to index
+ %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+ %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %30 = flow.dispatch.workload.ordinal %26, 0 : index
+ %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30}
+ %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+ %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+ %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30} -> tensor<?x86x128xf32>
+ %37 = tensor.empty(%30) : tensor<?x4096xf32>
+ %38 = tensor.empty() : tensor<4096x86x128xf32>
+ %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+ %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %42 = arith.extui %in : i4 to i32
+ %43 = arith.uitofp %42 : i32 to f32
+ %44 = arith.subf %43, %in_1 : f32
+ %45 = arith.mulf %44, %in_0 : f32
+ linalg.yield %45 : f32
+ } -> tensor<4096x86x128xf32>
+ %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%36, %40 : tensor<?x86x128xf32>, tensor<4096x86x128xf32>) outs(%39 : tensor<?x4096xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %42 = arith.mulf %in, %in_0 : f32
+ %43 = arith.addf %42, %out : f32
+ linalg.yield %43 : f32
+ } -> tensor<?x4096xf32>
+ flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 2, 128]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [64, 1, 1]>
// CHECK: func.func @i4_dequant_matvec()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-
-hal.executable @i4_dequant_matvec {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @i4_dequant_matvec layout(#pipeline_layout)
- builtin.module {
- func.func @i4_dequant_matvec() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f16
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = hal.interface.constant.load[5] : i32
- %6 = hal.interface.constant.load[6] : i32
- %7 = arith.index_castui %0 : i32 to index
- %8 = arith.index_castui %1 : i32 to index
- %9 = arith.index_castui %2 : i32 to index
- %10 = arith.extui %3 : i32 to i64
- %11 = arith.extui %4 : i32 to i64
- %12 = arith.shli %11, %c32_i64 : i64
- %13 = arith.ori %10, %12 : i64
- %14 = arith.index_castui %13 : i64 to index
- %15 = arith.extui %5 : i32 to i64
- %16 = arith.extui %6 : i32 to i64
- %17 = arith.shli %16, %c32_i64 : i64
- %18 = arith.ori %15, %17 : i64
- %19 = arith.index_castui %18 : i64 to index
- %20 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>>
- %21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
- %22 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
- %23 = flow.dispatch.workload.ordinal %19, 0 : index
- %24 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23}
- %25 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%14) : !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
- %26 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>> -> tensor<11008x32x128xi4>
- %27 = flow.dispatch.tensor.load %21, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
- %28 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
- %29 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0], sizes = [%23, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23} -> tensor<?x32x128xf16>
- %30 = tensor.empty() : tensor<11008x32x128xf16>
- %31 = tensor.empty(%23) : tensor<?x11008xf16>
- %32 = linalg.fill ins(%cst : f16) outs(%31 : tensor<?x11008xf16>) -> tensor<?x11008xf16>
- %33 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%26, %27, %28 : tensor<11008x32x128xi4>, tensor<11008x32xf16>, tensor<11008x32xf16>)
- outs(%30 : tensor<11008x32x128xf16>) {
- ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
- %35 = arith.extui %in : i4 to i32
- %36 = arith.uitofp %35 : i32 to f16
- %37 = arith.subf %36, %in_1 : f16
- %38 = arith.mulf %37, %in_0 : f16
- linalg.yield %38 : f16
- } -> tensor<11008x32x128xf16>
- %34 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction", "reduction"]}
- ins(%29, %33 : tensor<?x32x128xf16>, tensor<11008x32x128xf16>) outs(%32 : tensor<?x11008xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %35 = arith.mulf %in, %in_0 : f16
- %36 = arith.addf %35, %out : f16
- linalg.yield %36 : f16
- } -> tensor<?x11008xf16>
- flow.dispatch.tensor.store %34, %25, offsets = [0, 0], sizes = [%23, 11008], strides = [1, 1] : tensor<?x11008xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+module {
+ func.func @i4_dequant_matvec() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f16
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = hal.interface.constant.load[5] : i32
+ %6 = hal.interface.constant.load[6] : i32
+ %7 = arith.index_castui %0 : i32 to index
+ %8 = arith.index_castui %1 : i32 to index
+ %9 = arith.index_castui %2 : i32 to index
+ %10 = arith.extui %3 : i32 to i64
+ %11 = arith.extui %4 : i32 to i64
+ %12 = arith.shli %11, %c32_i64 : i64
+ %13 = arith.ori %10, %12 : i64
+ %14 = arith.index_castui %13 : i64 to index
+ %15 = arith.extui %5 : i32 to i64
+ %16 = arith.extui %6 : i32 to i64
+ %17 = arith.shli %16, %c32_i64 : i64
+ %18 = arith.ori %15, %17 : i64
+ %19 = arith.index_castui %18 : i64 to index
+ %20 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>>
+ %21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
+ %22 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
+ %23 = flow.dispatch.workload.ordinal %19, 0 : index
+ %24 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23}
+ %25 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%14) : !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
+ %26 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>> -> tensor<11008x32x128xi4>
+ %27 = flow.dispatch.tensor.load %21, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
+ %28 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
+ %29 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0], sizes = [%23, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23} -> tensor<?x32x128xf16>
+ %30 = tensor.empty() : tensor<11008x32x128xf16>
+ %31 = tensor.empty(%23) : tensor<?x11008xf16>
+ %32 = linalg.fill ins(%cst : f16) outs(%31 : tensor<?x11008xf16>) -> tensor<?x11008xf16>
+ %33 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26, %27, %28 : tensor<11008x32x128xi4>, tensor<11008x32xf16>, tensor<11008x32xf16>) outs(%30 : tensor<11008x32x128xf16>) {
+ ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
+ %35 = arith.extui %in : i4 to i32
+ %36 = arith.uitofp %35 : i32 to f16
+ %37 = arith.subf %36, %in_1 : f16
+ %38 = arith.mulf %37, %in_0 : f16
+ linalg.yield %38 : f16
+ } -> tensor<11008x32x128xf16>
+ %34 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29, %33 : tensor<?x32x128xf16>, tensor<11008x32x128xf16>) outs(%32 : tensor<?x11008xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %35 = arith.mulf %in, %in_0 : f16
+ %36 = arith.addf %35, %out : f16
+ linalg.yield %36 : f16
+ } -> tensor<?x11008xf16>
+ flow.dispatch.tensor.store %34, %25, offsets = [0, 0], sizes = [%23, 11008], strides = [1, 1] : tensor<?x11008xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 4, 128]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [64, 1, 1]>
// CHECK: func.func @i4_dequant_matvec()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @dynamic_batch_matvec {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @dynamic_batch_matvec layout(#pipeline_layout)
- builtin.module {
- func.func @dynamic_batch_matvec() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %13 = arith.index_castui %0 : i32 to index
- %18 = arith.index_castui %1 : i32 to index
- %19 = arith.index_castui %2 : i32 to index
- %24 = arith.index_castui %3 : i32 to index
- %29 = arith.index_castui %4 : i32 to index
- %30 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%19) : !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
- %31 = flow.dispatch.workload.ordinal %24, 0 : index
- %32 = flow.dispatch.workload.ordinal %29, 1 : index
- %33 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%13) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%31}
- %34 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%18) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%32}
- %35 = flow.dispatch.tensor.load %33, offsets = [0, 0, 0], sizes = [32, 1, %31], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%31} -> tensor<32x1x?xf16>
- %36 = flow.dispatch.tensor.load %34, offsets = [0, 0, 0], sizes = [32, %32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%32} -> tensor<32x?x128xf16>
- %37 = tensor.empty() : tensor<32x1x128xf16>
- %38 = linalg.fill ins(%cst : f16) outs(%37 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
- %39 = linalg.batch_matmul ins(%35, %36 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%38 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
- flow.dispatch.tensor.store %39, %30, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+module {
+ func.func @dynamic_batch_matvec() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = hal.interface.constant.load[4] : i32
+ %5 = arith.index_castui %0 : i32 to index
+ %6 = arith.index_castui %1 : i32 to index
+ %7 = arith.index_castui %2 : i32 to index
+ %8 = arith.index_castui %3 : i32 to index
+ %9 = arith.index_castui %4 : i32 to index
+ %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
+ %11 = flow.dispatch.workload.ordinal %8, 0 : index
+ %12 = flow.dispatch.workload.ordinal %9, 1 : index
+ %13 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11}
+ %14 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12}
+ %15 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [32, 1, %11], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11} -> tensor<32x1x?xf16>
+ %16 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0], sizes = [32, %12, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12} -> tensor<32x?x128xf16>
+ %17 = tensor.empty() : tensor<32x1x128xf16>
+ %18 = linalg.fill ins(%cst : f16) outs(%17 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
+ %19 = linalg.batch_matmul ins(%15, %16 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%18 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
+ flow.dispatch.tensor.store %19, %10, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 1], [0, 0, 0, 64]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK-LABEL: hal.executable.export public @dynamic_batch_matvec
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [64, 1, 1]>
// CHECK: func.func @dynamic_batch_matvec()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.batch_matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir
index dc16b19..9273827 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir
@@ -1,68 +1,46 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<1, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<2, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable private @complex_executable {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @complex_view_as_real ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @complex_view_as_real() {
- %c1 = arith.constant 1 : index
- %c0 = arith.constant 0 : index
- %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
- %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>>
- %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>>
- %7 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
- %8 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
- %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>> -> tensor<1x1x32x50x2xf32>
- %10 = tensor.empty() : tensor<32x50x2xf32>
- %extracted = tensor.extract %8[%c0] : tensor<1xi32>
- %11 = arith.extsi %extracted : i32 to i64
- %19 = arith.index_cast %11 : i64 to index
- %20 = flow.dispatch.tensor.load %5, offsets = [%19, 0], sizes = [1, 50], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>> -> tensor<50xcomplex<f32>>
- %21 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%20 : tensor<50xcomplex<f32>>) outs(%10 : tensor<32x50x2xf32>) {
- ^bb0(%in: complex<f32>, %out: f32):
- %22 = linalg.index 0 : index
- %23 = linalg.index 1 : index
- %extracted_0 = tensor.extract %9[%c0, %c0, %22, %23, %c0] : tensor<1x1x32x50x2xf32>
- %extracted_1 = tensor.extract %9[%c0, %c0, %22, %23, %c1] : tensor<1x1x32x50x2xf32>
- %24 = complex.create %extracted_0, %extracted_1 : complex<f32>
- %25 = complex.mul %24, %in : complex<f32>
- %26 = complex.re %25 : complex<f32>
- %27 = complex.im %25 : complex<f32>
- %28 = linalg.index 2 : index
- %29 = arith.cmpi eq, %28, %c0 : index
- %30 = arith.select %29, %26, %27 : f32
- linalg.yield %30 : f32
- } -> tensor<32x50x2xf32>
- flow.dispatch.tensor.store %21, %7, offsets = [0, 0, 0], sizes = [32, 50, 2], strides = [1, 1, 1] : tensor<32x50x2xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+#map = affine_map<(d0, d1, d2) -> (d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @complex_view_as_real() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x32x50x2xf32>> -> tensor<1x1x32x50x2xf32>
+ %6 = tensor.empty() : tensor<32x50x2xf32>
+ %extracted = tensor.extract %4[%c0] : tensor<1xi32>
+ %7 = arith.extsi %extracted : i32 to i64
+ %8 = arith.index_cast %7 : i64 to index
+ %9 = flow.dispatch.tensor.load %1, offsets = [%8, 0], sizes = [1, 50], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x50xcomplex<f32>>> -> tensor<50xcomplex<f32>>
+ %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9 : tensor<50xcomplex<f32>>) outs(%6 : tensor<32x50x2xf32>) {
+ ^bb0(%in: complex<f32>, %out: f32):
+ %11 = linalg.index 0 : index
+ %12 = linalg.index 1 : index
+ %extracted_0 = tensor.extract %5[%c0, %c0, %11, %12, %c0] : tensor<1x1x32x50x2xf32>
+ %extracted_1 = tensor.extract %5[%c0, %c0, %11, %12, %c1] : tensor<1x1x32x50x2xf32>
+ %13 = complex.create %extracted_0, %extracted_1 : complex<f32>
+ %14 = complex.mul %13, %in : complex<f32>
+ %15 = complex.re %14 : complex<f32>
+ %16 = complex.im %14 : complex<f32>
+ %17 = linalg.index 2 : index
+ %18 = arith.cmpi eq, %17, %c0 : index
+ %19 = arith.select %18, %15, %16 : f32
+ linalg.yield %19 : f32
+ } -> tensor<32x50x2xf32>
+ flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [32, 50, 2], strides = [1, 1, 1] : tensor<32x50x2xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x50x2xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 2, 2], [1, 1, 1]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute>
-// CHECK: hal.executable.export public @complex_view_as_real
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 2 : index, 4 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseDistribute workgroup_size = [2, 2, 4]>
// CHECK: func.func @complex_view_as_real()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
index dee5040..f521960 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
@@ -1,174 +1,110 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @subgroup_reduce_f32 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @subgroup_reduce_f32 ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @subgroup_reduce_f32() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x512xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x512xf32>> -> tensor<2x512xf32>
- %3 = tensor.empty() : tensor<2xf32>
- %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2xf32>) -> tensor<2xf32>
- %5 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>],
- iterator_types = ["parallel", "reduction"]
- } ins(%2 : tensor<2x512xf32>) outs(%4 : tensor<2xf32>) {
- ^bb0(%arg0: f32, %arg1: f32):
- %6 = arith.addf %arg1, %arg0 : f32
- linalg.yield %6 : f32
- } -> tensor<2xf32>
- flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !flow.dispatch.tensor<writeonly:tensor<2xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @subgroup_reduce_f32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x512xf32>> -> tensor<2x512xf32>
+ %3 = tensor.empty() : tensor<2xf32>
+ %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2xf32>) -> tensor<2xf32>
+ %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<2x512xf32>) outs(%4 : tensor<2xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %out, %in : f32
+ linalg.yield %6 : f32
+ } -> tensor<2xf32>
+ flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !flow.dispatch.tensor<writeonly:tensor<2xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1], [0, 512]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK: hal.executable.export public @subgroup_reduce_f32
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [128 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [128, 1, 1]>
// CHECK: func.func @subgroup_reduce_f32()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @subgroup_reduce_f16 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, GroupNonUniformShuffle], []>, Unknown:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @subgroup_reduce_f16 ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @subgroup_reduce_f16() {
- %cst = arith.constant 0.000000e+00 : f16
- %c0 = arith.constant 0 : index
- %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf16>>
- %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x4096x4096xf16>>
- %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf16>> -> tensor<16x4096x4096xf16>
- %7 = tensor.empty() : tensor<16x4096x4096xf16>
- %8 = tensor.empty() : tensor<16x4096xf16>
- %9 = linalg.fill ins(%cst : f16) outs(%8 : tensor<16x4096xf16>) -> tensor<16x4096xf16>
- %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<16x4096x4096xf16>) outs(%9 : tensor<16x4096xf16>) {
- ^bb0(%in: f16, %out: f16):
- %12 = arith.addf %in, %out : f16
- linalg.yield %12 : f16
- } -> tensor<16x4096xf16>
- %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%6, %10 : tensor<16x4096x4096xf16>, tensor<16x4096xf16>) outs(%7 : tensor<16x4096x4096xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %12 = arith.divf %in, %in_0 : f16
- linalg.yield %12 : f16
- } -> tensor<16x4096x4096xf16>
- flow.dispatch.tensor.store %11, %5, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : tensor<16x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x4096x4096xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @subgroup_reduce_f16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f16
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x4096x4096xf16>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf16>> -> tensor<16x4096x4096xf16>
+ %3 = tensor.empty() : tensor<16x4096x4096xf16>
+ %4 = tensor.empty() : tensor<16x4096xf16>
+ %5 = linalg.fill ins(%cst : f16) outs(%4 : tensor<16x4096xf16>) -> tensor<16x4096xf16>
+ %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<16x4096x4096xf16>) outs(%5 : tensor<16x4096xf16>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.addf %in, %out : f16
+ linalg.yield %8 : f16
+ } -> tensor<16x4096xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<16x4096x4096xf16>, tensor<16x4096xf16>) outs(%3 : tensor<16x4096x4096xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.divf %in, %in_0 : f16
+ linalg.yield %8 : f16
+ } -> tensor<16x4096x4096xf16>
+ flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : tensor<16x4096x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x4096x4096xf16>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1], [0, 0, 512]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK: hal.executable.export public @subgroup_reduce_f16
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [64, 1, 1]>
// CHECK: func.func @subgroup_reduce_f16()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-
-hal.executable private @subgroup_reduce_dynamic {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, GroupNonUniformShuffle], []>, api=Vulkan, Unknown:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @subgroup_reduce_dynamic ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @subgroup_reduce_dynamic() {
- %c32_i64 = arith.constant 32 : i64
- %cst = arith.constant 0.000000e+00 : f32
- %cst_0 = arith.constant 2.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8xf32>>
- %8 = flow.dispatch.workload.ordinal %6, 0 : index
- %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x?xf32>>{%8}
- %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [8, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8x?xf32>>{%8} -> tensor<8x?xf32>
- %11 = tensor.empty() : tensor<8xf32>
- %12 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1], [0, 64]]>} ins(%cst : f32) outs(%11 : tensor<8xf32>) -> tensor<8xf32>
- %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<8x?xf32>) outs(%12 : tensor<8xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1], [0, 64]]>} {
- ^bb0(%in: f32, %out: f32):
- %14 = math.powf %in, %cst_0 : f32
- %15 = arith.addf %14, %out : f32
- linalg.yield %15 : f32
- } -> tensor<8xf32>
- flow.dispatch.tensor.store %13, %7, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor<writeonly:tensor<8xf32>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[1], [0, 64]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, GroupNonUniformShuffle], []>, api=Vulkan, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @subgroup_reduce_dynamic() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 2.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8xf32>>
+ %8 = flow.dispatch.workload.ordinal %6, 0 : index
+ %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x?xf32>>{%8}
+ %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [8, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8x?xf32>>{%8} -> tensor<8x?xf32>
+ %11 = tensor.empty() : tensor<8xf32>
+ %12 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : tensor<8xf32>) -> tensor<8xf32>
+ %13 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<8x?xf32>) outs(%12 : tensor<8xf32>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f32, %out: f32):
+ %14 = math.powf %in, %cst_0 : f32
+ %15 = arith.addf %14, %out : f32
+ linalg.yield %15 : f32
+ } -> tensor<8xf32>
+ flow.dispatch.tensor.store %13, %7, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor<writeonly:tensor<8xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1], [0, 64]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
-// CHECK: hal.executable.export public @subgroup_reduce_dynamic
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce workgroup_size = [64, 1, 1]>
// CHECK: func.func @subgroup_reduce_dynamic()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir
index ad0433c..9980423 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir
@@ -1,55 +1,35 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-hal.executable @i4_dequant {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @i4_dequant layout(#pipeline_layout)
- builtin.module {
- func.func @i4_dequant() {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<131072x128xi4>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<131072xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<131072xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<131072x128xf32>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<131072x128xi4>> -> tensor<131072x128xi4>
- %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xf32>> -> tensor<131072xf32>
- %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xf32>> -> tensor<131072xf32>
- %7 = tensor.empty() : tensor<131072x128xf32>
- %8 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]
- } ins(%4, %5, %6 : tensor<131072x128xi4>, tensor<131072xf32>, tensor<131072xf32>) outs(%7 : tensor<131072x128xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %9 = arith.extui %in : i4 to i32
- %10 = arith.uitofp %9 : i32 to f32
- %11 = arith.subf %10, %in_1 : f32
- %12 = arith.mulf %11, %in_0 : f32
- linalg.yield %12 : f32
- } -> tensor<131072x128xf32>
- flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : tensor<131072x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<131072x128xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @i4_dequant() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<131072x128xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<131072xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<131072xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<131072x128xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<131072x128xi4>> -> tensor<131072x128xi4>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xf32>> -> tensor<131072xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor<readonly:tensor<131072xf32>> -> tensor<131072xf32>
+ %7 = tensor.empty() : tensor<131072x128xf32>
+ %8 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %5, %6 : tensor<131072x128xi4>, tensor<131072xf32>, tensor<131072xf32>) outs(%7 : tensor<131072x128xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %9 = arith.extui %in : i4 to i32
+ %10 = arith.uitofp %9 : i32 to f32
+ %11 = arith.subf %10, %in_1 : f32
+ %12 = arith.mulf %11, %in_0 : f32
+ linalg.yield %12 : f32
+ } -> tensor<131072x128xf32>
+ flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : tensor<131072x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<131072x128xf32>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 128], [2, 8]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK-LABEL: hal.executable.export public @i4_dequant
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 4 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [16, 4, 1]>
+// CHECK: func.func @i4_dequant()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
index 5cb4a52..0186b14 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir
@@ -1,270 +1,154 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
// Conv - large OC - distribute to only one workgroup dimension.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @conv_112x112x512 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @conv_112x112x512 layout(#pipeline_layout)
- builtin.module {
- func.func @conv_112x112x512() {
- %c0 = arith.constant 0 : index
- %c512 = arith.constant 512 : index
- %c112 = arith.constant 112 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
- %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
- %22 = tensor.empty() : tensor<1x112x112x512xf32>
- %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
- %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>)
- outs(%23 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
- flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1]
- : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @conv_112x112x512() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c512 = arith.constant 512 : index
+ %c112 = arith.constant 112 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
+ %5 = tensor.empty() : tensor<1x112x112x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%6 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1] : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 4, 64], [1, 1, 4, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @conv_112x112x512
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [16, 1, 1]>
// CHECK: func.func @conv_112x112x512()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Conv - medium OC/OW/OH - distribute to two workgroup dimensions.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @conv_112x112x32 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @conv_112x112x32 layout(#pipeline_layout)
- builtin.module {
- func.func @conv_112x112x32() {
- %c0 = arith.constant 0 : index
- %c32 = arith.constant 32 : index
- %c112 = arith.constant 112 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
- %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
- %22 = tensor.empty() : tensor<1x112x112x32xf32>
- %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
- %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%13, %15 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%23 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
- flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1]
- : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @conv_112x112x32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c32 = arith.constant 32 : index
+ %c112 = arith.constant 112 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
+ %5 = tensor.empty() : tensor<1x112x112x32xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 8, 32], [1, 1, 4, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @conv_112x112x32
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 1]>
// CHECK: func.func @conv_112x112x32()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Conv - small OC/OW/OH - distribute to all three workgroup dimensions.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @conv_16x16x16 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @conv_16x16x16 layout(#pipeline_layout)
- builtin.module {
- func.func @conv_16x16x16() {
- %c0 = arith.constant 0 : index
- %c16 = arith.constant 16 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
- %13 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
- %15 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
- %22 = tensor.empty() : tensor<1x16x16x16xf32>
- %23 = linalg.fill ins(%cst : f32) outs(%22 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
- %24 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%13, %15 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%23 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
- flow.dispatch.tensor.store %24, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1]
- : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @conv_16x16x16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
+ %5 = tensor.empty() : tensor<1x16x16x16xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1] : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 4, 4, 16], [1, 2, 2, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @conv_16x16x16
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [4 : index, 2 : index, 2 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [4, 2, 2]>
// CHECK: func.func @conv_16x16x16()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Depthwise conv - small OC/OW/OH - distribute to all three workgroup dimensions.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @dwconv_28x28x144 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @dwconv_28x28x144 layout(#pipeline_layout)
- builtin.module {
- func.func @dwconv_28x28x144() {
- %c0 = arith.constant 0 : index
- %c144 = arith.constant 144 : index
- %c28 = arith.constant 28 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
- %14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [0, 57, 57, 144], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
- %16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
- %23 = tensor.empty() : tensor<1x28x28x144xf32>
- %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
- %25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%14, %16 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%24 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
- flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1]
- : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @dwconv_28x28x144() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c144 = arith.constant 144 : index
+ %c28 = arith.constant 28 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [0, 57, 57, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
+ %5 = tensor.empty() : tensor<1x28x28x144xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%6 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1] : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 4, 4, 16], [1, 2, 2, 4], [0, 0, 0, 0, 1, 1], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @dwconv_28x28x144
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [4 : index, 2 : index, 2 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [4, 2, 2]>
// CHECK: func.func @dwconv_28x28x144()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Depthwise conv - tiny OC/OW/OH - starving the GPU.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @dwconv_1x2x8 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export public @dwconv_1x2x8 layout(#pipeline_layout)
- builtin.module {
- func.func @dwconv_1x2x8() {
- %c0 = arith.constant 0 : index
- %c8 = arith.constant 8 : index
- %c2 = arith.constant 2 : index
- %c1 = arith.constant 1 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
- %14 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 3, 5, 8], strides = [1, 1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>> -> tensor<1x3x5x8xf32>
- %16 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
- %23 = tensor.empty() : tensor<1x1x2x8xf32>
- %24 = linalg.fill ins(%cst : f32) outs(%23 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
- %25 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
- ins(%14, %16 : tensor<1x3x5x8xf32>, tensor<3x3x8xf32>) outs(%24 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
- flow.dispatch.tensor.store %25, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 2, 8], strides = [1, 1, 1, 1]
- : tensor<1x1x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @dwconv_1x2x8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %c2 = arith.constant 2 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 3, 5, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>> -> tensor<1x3x5x8xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
+ %5 = tensor.empty() : tensor<1x1x2x8xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
+ %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x3x5x8xf32>, tensor<3x3x8xf32>) outs(%6 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 2, 8], strides = [1, 1, 1, 1] : tensor<1x1x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 2, 8], [1, 1, 1, 4], [0, 0, 0, 0, 1, 1], [0, 1, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @dwconv_1x2x8
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 2, 1]>
// CHECK: func.func @dwconv_1x2x8()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
index d7e3f52..68b0aaf 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
@@ -1,608 +1,361 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
// Large matmul that can match the best tiling scheme.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_1024x2048x512 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @matmul_1024x2048x512 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_1024x2048x512() {
- %c0 = arith.constant 0 : index
- %c2048 = arith.constant 2048 : index
- %c1024 = arith.constant 1024 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
- %15 = tensor.empty() : tensor<1024x2048xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
- %17 = linalg.matmul
- ins(%8, %10 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%16 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1]
- : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @matmul_1024x2048x512() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c2048 = arith.constant 2048 : index
+ %c1024 = arith.constant 1024 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
+ %5 = tensor.empty() : tensor<1024x2048xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%6 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 32], [4, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_1024x2048x512
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 1]>
// CHECK: func.func @matmul_1024x2048x512()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Small matmul N that can still tile to all threads in a workgroup.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_3136x24x96 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @matmul_3136x24x96 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_3136x24x96() {
- %c0 = arith.constant 0 : index
- %c24 = arith.constant 24 : index
- %c3136 = arith.constant 3136 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
- %15 = tensor.empty() : tensor<3136x24xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
- %17 = linalg.matmul
- ins(%8, %10 : tensor<3136x96xf32>, tensor<96x24xf32>)
- outs(%16 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1]
- : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @matmul_3136x24x96() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c24 = arith.constant 24 : index
+ %c3136 = arith.constant 3136 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
+ %5 = tensor.empty() : tensor<3136x24xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%6 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1] : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 8], [4, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_3136x24x96
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 8, 1]>
// CHECK: func.func @matmul_3136x24x96()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Small matmul M that can still tile to all threads in a workgroup.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_196x64x192 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @matmul_196x64x192 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_196x64x192() {
- %c0 = arith.constant 0 : index
- %c64 = arith.constant 64 : index
- %c196 = arith.constant 196 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
- %15 = tensor.empty() : tensor<196x64xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<196x64xf32>) -> tensor<196x64xf32>
- %17 = linalg.matmul
- ins(%8, %10 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%16 : tensor<196x64xf32>) -> tensor<196x64xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1]
- : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @matmul_196x64x192() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c64 = arith.constant 64 : index
+ %c196 = arith.constant 196 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
+ %5 = tensor.empty() : tensor<196x64xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x64xf32>) -> tensor<196x64xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%6 : tensor<196x64xf32>) -> tensor<196x64xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1] : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 32], [2, 4], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_196x64x192
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 1]>
// CHECK: func.func @matmul_196x64x192()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Small matmul K that can still tile to all threads in a workgroup.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_12544x96x16 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @matmul_12544x96x16 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_12544x96x16() {
- %c0 = arith.constant 0 : index
- %c96 = arith.constant 96 : index
- %c12544 = arith.constant 12544 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
- linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
- linalg.matmul
- ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @matmul_12544x96x16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c96 = arith.constant 96 : index
+ %c12544 = arith.constant 12544 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
+ linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
+ linalg.matmul ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 32], [4, 4], [0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_12544x96x16
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 1]>
// CHECK: func.func @matmul_12544x96x16()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Odd matmul M and small N that cannot utilize all threads in a workgroup.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_49x160x576 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @matmul_49x160x576 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_49x160x576() {
- %c0 = arith.constant 0 : index
- %c160 = arith.constant 160 : index
- %c49 = arith.constant 49 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<49x576xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<576x160xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<49x576xf32>> -> tensor<49x576xf32>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<576x160xf32>> -> tensor<576x160xf32>
- %15 = tensor.empty() : tensor<49x160xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<49x160xf32>) -> tensor<49x160xf32>
- %17 = linalg.matmul
- ins(%8, %10 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%16 : tensor<49x160xf32>) -> tensor<49x160xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1]
- : tensor<49x160xf32> -> !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @matmul_49x160x576() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c160 = arith.constant 160 : index
+ %c49 = arith.constant 49 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<49x576xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<576x160xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x576xf32>> -> tensor<49x576xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x160xf32>> -> tensor<576x160xf32>
+ %5 = tensor.empty() : tensor<49x160xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<49x160xf32>) -> tensor<49x160xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%6 : tensor<49x160xf32>) -> tensor<49x160xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1] : tensor<49x160xf32> -> !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 32], [1, 4], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_49x160x576
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 1, 1]>
// CHECK: func.func @matmul_49x160x576()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Small matmul M to "shift" parallelism to N.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @matmul_2x1024x576 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @matmul_2x1024x576 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_2x1024x576() {
- %cst = arith.constant 0.000000e+00 : f32
- %cst_0 = arith.constant 3.000000e+00 : f32
- %cst_1 = arith.constant 6.000000e+00 : f32
- %cst_2 = arith.constant 0.166666672 : f32
- %c0 = arith.constant 0 : index
- %c3436864 = arith.constant 3436864 : index
- %c10141312 = arith.constant 10141312 : index
- %c2304 = arith.constant 2304 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x576xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3436864) : !flow.dispatch.tensor<readonly:tensor<576x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c10141312) : !flow.dispatch.tensor<readonly:tensor<2x1024xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x576xf32>> -> tensor<2x576xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x1024xf32>> -> tensor<576x1024xf32>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024xf32>> -> tensor<2x1024xf32>
- %7 = tensor.empty() : tensor<2x1024xf32>
- %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
- %9 = linalg.matmul ins(%4, %5 : tensor<2x576xf32>, tensor<576x1024xf32>) outs(%8 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
- flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @matmul_2x1024x576() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 3.000000e+00 : f32
+ %cst_1 = arith.constant 6.000000e+00 : f32
+ %cst_2 = arith.constant 0.166666672 : f32
+ %c0 = arith.constant 0 : index
+ %c3436864 = arith.constant 3436864 : index
+ %c10141312 = arith.constant 10141312 : index
+ %c2304 = arith.constant 2304 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x576xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3436864) : !flow.dispatch.tensor<readonly:tensor<576x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c10141312) : !flow.dispatch.tensor<readonly:tensor<2x1024xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x576xf32>> -> tensor<2x576xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x1024xf32>> -> tensor<576x1024xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024xf32>> -> tensor<2x1024xf32>
+ %7 = tensor.empty() : tensor<2x1024xf32>
+ %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
+ %9 = linalg.matmul ins(%4, %5 : tensor<2x576xf32>, tensor<576x1024xf32>) outs(%8 : tensor<2x1024xf32>) -> tensor<2x1024xf32>
+ flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1024xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[2, 128], [2, 4], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_2x1024x576
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
// CHECK: func.func @matmul_2x1024x576()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Large matmul with i8 inputs.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_1024x2048x512xi8 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @matmul_1024x2048x512xi8 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_1024x2048x512xi8() {
- %c0 = arith.constant 0 : index
- %c2048 = arith.constant 2048 : index
- %c1024 = arith.constant 1024 : index
- %cst = arith.constant 0 : i32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xi8>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xi8>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xi32>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xi8>> -> tensor<1024x512xi8>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi8>> -> tensor<512x2048xi8>
- %15 = tensor.empty() : tensor<1024x2048xi32>
- %16 = linalg.fill ins(%cst : i32) outs(%15 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32>
- %17 = linalg.matmul
- ins(%8, %10 : tensor<1024x512xi8>, tensor<512x2048xi8>) outs(%16 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1]
- : tensor<1024x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xi32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @matmul_1024x2048x512xi8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c2048 = arith.constant 2048 : index
+ %c1024 = arith.constant 1024 : index
+ %c0_i32 = arith.constant 0 : i32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xi8>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xi8>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xi32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xi8>> -> tensor<1024x512xi8>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xi8>> -> tensor<512x2048xi8>
+ %5 = tensor.empty() : tensor<1024x2048xi32>
+ %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<1024x512xi8>, tensor<512x2048xi8>) outs(%6 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xi32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xi32>>
+ return
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8, 32], [4, 4], [0, 0, 16]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_1024x2048x512xi8
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
-// CHECK: func.func @matmul_1024x2048x512xi8()
-// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-
// -----
-
-// Large batch matmul.
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_4x384x384 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @batch_matmul_4x384x384 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_4x384x384() {
- %c0 = arith.constant 0 : index
- %c384 = arith.constant 384 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>> -> tensor<4x384x32xf32>
- %14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>> -> tensor<4x32x384xf32>
- %21 = tensor.empty() : tensor<4x384x384xf32>
- %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
- %23 = linalg.batch_matmul
- ins(%11, %14 : tensor<4x384x32xf32>, tensor<4x32x384xf32>)
- outs(%22 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
- flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1]
- : tensor<4x384x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @batch_matmul_4x384x384() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c384 = arith.constant 384 : index
+ %c4 = arith.constant 4 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>> -> tensor<4x384x32xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>> -> tensor<4x32x384xf32>
+ %5 = tensor.empty() : tensor<4x384x384xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
+ %7 = linalg.batch_matmul ins(%3, %4 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%6 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1] : tensor<4x384x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 12, 32], [1, 6, 4], [0, 0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @batch_matmul_4x384x384
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 1]>
// CHECK: func.func @batch_matmul_4x384x384()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Small batch matmul.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @batch_matmul_4x2x8 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @batch_matmul_4x2x8 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_4x2x8() {
- %c0 = arith.constant 0 : index
- %c8 = arith.constant 8 : index
- %c2 = arith.constant 2 : index
- %c4 = arith.constant 4 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x2x32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x2x8xf32>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2, 32], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x2x32xf32>> -> tensor<4x2x32xf32>
- %14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1]
- : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>> -> tensor<4x32x8xf32>
- %21 = tensor.empty() : tensor<4x2x8xf32>
- %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
- %23 = linalg.batch_matmul
- ins(%11, %14 : tensor<4x2x32xf32>, tensor<4x32x8xf32>) outs(%22 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
- flow.dispatch.tensor.store %23, %2, offsets = [0, 0, 0], sizes = [4, 2, 8], strides = [1, 1, 1]
- : tensor<4x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x2x8xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+module {
+ func.func @batch_matmul_4x2x8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %c2 = arith.constant 2 : index
+ %c4 = arith.constant 4 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x2x32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x2x8xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x2x32xf32>> -> tensor<4x2x32xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>> -> tensor<4x32x8xf32>
+ %5 = tensor.empty() : tensor<4x2x8xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
+ %7 = linalg.batch_matmul ins(%3, %4 : tensor<4x2x32xf32>, tensor<4x32x8xf32>) outs(%6 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 2, 8], strides = [1, 1, 1] : tensor<4x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x2x8xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 2, 8], [1, 1, 4], [0, 0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @batch_matmul_4x2x8
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [2 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [2, 2, 1]>
// CHECK: func.func @batch_matmul_4x2x8()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.batch_matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Linalg.generic that is a batch matmul.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @generic_batch_matmul_32x2x512 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @generic_batch_matmul_32x2x512 layout(#pipeline_layout)
- builtin.module {
- func.func @generic_batch_matmul_32x2x512() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<8x32x64xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32x64x512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x8x512xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x32x64xf32>> -> tensor<8x32x64xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x64x512xf32>> -> tensor<32x64x512xf32>
- %5 = tensor.empty() : tensor<32x8x512xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<32x8x512xf32>) -> tensor<32x8x512xf32>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<8x32x64xf32>, tensor<32x64x512xf32>) outs(%6 : tensor<32x8x512xf32>) attrs = {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]} {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %8 = arith.mulf %arg0, %arg1 : f32
- %9 = arith.addf %arg2, %8 : f32
- linalg.yield %9 : f32
- } -> tensor<32x8x512xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 8, 512], strides = [1, 1, 1] : tensor<32x8x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x8x512xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @generic_batch_matmul_32x2x512() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<8x32x64xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32x64x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x8x512xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x32x64xf32>> -> tensor<8x32x64xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x64x512xf32>> -> tensor<32x64x512xf32>
+ %5 = tensor.empty() : tensor<32x8x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<32x8x512xf32>) -> tensor<32x8x512xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<8x32x64xf32>, tensor<32x64x512xf32>) outs(%6 : tensor<32x8x512xf32>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.mulf %in, %in_0 : f32
+ %9 = arith.addf %out, %8 : f32
+ linalg.yield %9 : f32
+ } -> tensor<32x8x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 8, 512], strides = [1, 1, 1] : tensor<32x8x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x8x512xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 8, 32], [1, 4, 4], [0, 0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @generic_batch_matmul_32x2x512
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 1]>
// CHECK: func.func @generic_batch_matmul_32x2x512()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Linalg.generic that is a batch matmul.
-#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
-#map3 = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
-#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-
-hal.executable @generic_batch_matmul_8x2500x512x4608 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 16>>
- }>) {
- hal.executable.export @generic_batch_matmul_8x2500x512x4608 layout(#pipeline_layout)
- builtin.module {
- func.func @generic_batch_matmul_8x2500x512x4608() {
- %c168607744 = arith.constant 168607744 : index
- %c537247744 = arith.constant 537247744 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c168607744) : !flow.dispatch.tensor<readonly:tensor<8x2500x4608xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4608x512xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c537247744) : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>>
- %4 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x2500x512xf32>>
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [8, 2500, 4608], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x2500x4608xf32>> -> tensor<8x2500x4608xf32>
- %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4608, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4608x512xf32>> -> tensor<4608x512xf32>
- %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>> -> tensor<8x2500x512xf32>
- %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>> -> tensor<8x2500x512xf32>
- %9 = tensor.empty() : tensor<8x2500x512xf32>
- %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x2500x512xf32>) -> tensor<8x2500x512xf32>
- %11 = linalg.generic {
- indexing_maps = [#map2, #map3, #map4],
- iterator_types = ["parallel", "parallel", "parallel", "reduction"]
- } ins(%5, %6 : tensor<8x2500x4608xf32>, tensor<4608x512xf32>) outs(%10 : tensor<8x2500x512xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
- %13 = arith.mulf %arg0, %arg1 : f32
- %14 = arith.addf %13, %arg2 : f32
- linalg.yield %14 : f32
- } -> tensor<8x2500x512xf32>
- %12 = linalg.generic {
- indexing_maps = [#map5, #map5, #map5, #map5],
- iterator_types = ["parallel", "parallel", "parallel"]
- } ins(%11, %7, %8 : tensor<8x2500x512xf32>, tensor<8x2500x512xf32>, tensor<8x2500x512xf32>) outs(%9 : tensor<8x2500x512xf32>) {
- ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32):
- %13 = arith.addf %arg0, %arg1 : f32
- %14 = arith.subf %13, %arg2 : f32
- linalg.yield %14 : f32
- } -> tensor<8x2500x512xf32>
- flow.dispatch.tensor.store %12, %4, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : tensor<8x2500x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x2500x512xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, ARM:IntegratedGPU, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 16>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @generic_batch_matmul_8x2500x512x4608() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c168607744 = arith.constant 168607744 : index
+ %c537247744 = arith.constant 537247744 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c168607744) : !flow.dispatch.tensor<readonly:tensor<8x2500x4608xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4608x512xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c537247744) : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x2500x512xf32>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [8, 2500, 4608], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x2500x4608xf32>> -> tensor<8x2500x4608xf32>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4608, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4608x512xf32>> -> tensor<4608x512xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>> -> tensor<8x2500x512xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x2500x512xf32>> -> tensor<8x2500x512xf32>
+ %9 = tensor.empty() : tensor<8x2500x512xf32>
+ %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x2500x512xf32>) -> tensor<8x2500x512xf32>
+ %11 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%5, %6 : tensor<8x2500x4608xf32>, tensor<4608x512xf32>) outs(%10 : tensor<8x2500x512xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %13 = arith.mulf %in, %in_0 : f32
+ %14 = arith.addf %13, %out : f32
+ linalg.yield %14 : f32
+ } -> tensor<8x2500x512xf32>
+ %12 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%11, %7, %8 : tensor<8x2500x512xf32>, tensor<8x2500x512xf32>, tensor<8x2500x512xf32>) outs(%9 : tensor<8x2500x512xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %in_1: f32, %out: f32):
+ %13 = arith.addf %in, %in_0 : f32
+ %14 = arith.subf %13, %in_1 : f32
+ linalg.yield %14 : f32
+ } -> tensor<8x2500x512xf32>
+ flow.dispatch.tensor.store %12, %4, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : tensor<8x2500x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x2500x512xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 10, 32], [1, 5, 4], [0, 0, 0, 4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @generic_batch_matmul_8x2500x512x4608
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [8 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 1]>
// CHECK: func.func @generic_batch_matmul_8x2500x512x4608()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir
index 46f5e8e..430750f 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir
@@ -1,155 +1,98 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_4x4096x9216 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @matmul_4x4096x9216 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_4x4096x9216() {
- %c36864 = arith.constant 36864 : index
- %c667974912 = arith.constant 667974912 : index
- %c209920 = arith.constant 209920 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x9216xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c209920) : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c667974912) : !flow.dispatch.tensor<readonly:tensor<4x4096xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c36864) : !flow.dispatch.tensor<writeonly:tensor<4x4096xf32>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 9216], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x9216xf32>> -> tensor<4x9216xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>> -> tensor<9216x4096xf32>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x4096xf32>> -> tensor<4x4096xf32>
- %8 = linalg.matmul ins(%4, %5 : tensor<4x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<4x4096xf32>) -> tensor<4x4096xf32>
- flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [4, 4096], strides = [1, 1] : tensor<4x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x4096xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64]>>}>
+module {
+ func.func @matmul_4x4096x9216() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c36864 = arith.constant 36864 : index
+ %c667974912 = arith.constant 667974912 : index
+ %c209920 = arith.constant 209920 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x9216xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c209920) : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c667974912) : !flow.dispatch.tensor<readonly:tensor<4x4096xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c36864) : !flow.dispatch.tensor<writeonly:tensor<4x4096xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 9216], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x9216xf32>> -> tensor<4x9216xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>> -> tensor<9216x4096xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x4096xf32>> -> tensor<4x4096xf32>
+ %7 = linalg.matmul ins(%4, %5 : tensor<4x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<4x4096xf32>) -> tensor<4x4096xf32>
+ flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [4, 4096], strides = [1, 1] : tensor<4x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x4096xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 128, 32]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @matmul_4x4096x9216
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 4 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 4, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
// CHECK: func.func @matmul_4x4096x9216()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Matvec does not go down matmul pipelines.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @matmul_1x4096x9216 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @matmul_1x4096x9216 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_1x4096x9216() {
- %c36864 = arith.constant 36864 : index
- %c667974912 = arith.constant 667974912 : index
- %c209920 = arith.constant 209920 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x9216xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c209920) : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c667974912) : !flow.dispatch.tensor<readonly:tensor<1x4096xf32>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c36864) : !flow.dispatch.tensor<writeonly:tensor<1x4096xf32>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 9216], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x9216xf32>> -> tensor<1x9216xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>> -> tensor<9216x4096xf32>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf32>> -> tensor<1x4096xf32>
- %8 = linalg.matmul ins(%4, %5 : tensor<1x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<1x4096xf32>) -> tensor<1x4096xf32>
- flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : tensor<1x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4096xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64]>>}>
+module {
+ func.func @matmul_1x4096x9216() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c36864 = arith.constant 36864 : index
+ %c667974912 = arith.constant 667974912 : index
+ %c209920 = arith.constant 209920 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x9216xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c209920) : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c667974912) : !flow.dispatch.tensor<readonly:tensor<1x4096xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c36864) : !flow.dispatch.tensor<writeonly:tensor<1x4096xf32>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 9216], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x9216xf32>> -> tensor<1x9216xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<9216x4096xf32>> -> tensor<9216x4096xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096xf32>> -> tensor<1x4096xf32>
+ %7 = linalg.matmul ins(%4, %5 : tensor<1x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<1x4096xf32>) -> tensor<1x4096xf32>
+ flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : tensor<1x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4096xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 2048], [1, 8], [0, 0, 8]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_1x4096x9216
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [256 : index, 1 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [256, 1, 1]>
// CHECK: func.func @matmul_1x4096x9216()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
// Multi-reduction-dimension transposed-B matmul.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @multi_reduction_transposed_b_matmul {
- hal.executable.variant public @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @multi_reduction_transposed_b_matmul layout(#pipeline_layout)
- builtin.module {
- func.func @multi_reduction_transposed_b_matmul() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x86x128xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>> -> tensor<4096x86x128xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2048, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x86x128xf32>> -> tensor<2048x86x128xf32>
- %5 = tensor.empty() : tensor<4096x2048xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4096x2048xf32>) -> tensor<4096x2048xf32>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction", "reduction"]
- } ins(%3, %4 : tensor<4096x86x128xf32>, tensor<2048x86x128xf32>) outs(%6 : tensor<4096x2048xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %8 = arith.mulf %in, %in_0 : f32
- %9 = arith.addf %out, %8 : f32
- linalg.yield %9 : f32
- } -> tensor<4096x2048xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4096, 2048], strides = [1, 1] : tensor<4096x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64]>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+module {
+ func.func @multi_reduction_transposed_b_matmul() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x86x128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>> -> tensor<4096x86x128xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2048, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x86x128xf32>> -> tensor<2048x86x128xf32>
+ %5 = tensor.empty() : tensor<4096x2048xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4096x2048xf32>) -> tensor<4096x2048xf32>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<4096x86x128xf32>, tensor<2048x86x128xf32>) outs(%6 : tensor<4096x2048xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %8 = arith.mulf %in, %in_0 : f32
+ %9 = arith.addf %out, %8 : f32
+ linalg.yield %9 : f32
+ } -> tensor<4096x2048xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4096, 2048], strides = [1, 1] : tensor<4096x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 128, 1, 32]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
-// CHECK: hal.executable.export public @multi_reduction_transposed_b_matmul
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [32 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
// CHECK: func.func @multi_reduction_transposed_b_matmul()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
index fbf6010..f67ab1f 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
@@ -1,366 +1,174 @@
// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))' \
-// RUN: %s | FileCheck %s
+// RUN: --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | \
+// RUN: FileCheck %s
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
#map = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_256x1024x128_div_add {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @matmul_256x1024x128_div_add layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_256x1024x128_div_add() {
- %c0 = arith.constant 0 : index
- %c1024 = arith.constant 1024 : index
- %c256 = arith.constant 256 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x128xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- %11 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
- %14 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
- %17 = tensor.empty() : tensor<256x1024xf16>
- %19 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x128xf16>> -> tensor<256x128xf16>
- %21 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>> -> tensor<128x1024xf16>
- %24 = tensor.empty() : tensor<256x1024xf16>
- %25 = linalg.fill ins(%cst : f16) outs(%24 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %26 = linalg.matmul ins(%19, %21 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%25 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %27 = linalg.generic {
- indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]}
- ins(%26, %11, %14 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>)
- outs(%17 : tensor<256x1024xf16>) {
- ^bb0(%arg2: f16, %arg3: f16, %arg4: f16, %arg5: f16): // no predecessors
- %28 = arith.divf %arg2, %arg3 : f16
- %29 = arith.addf %28, %arg4 : f16
- linalg.yield %29 : f16
- } -> tensor<256x1024xf16>
- flow.dispatch.tensor.store %27, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- return
- }
- }
+module {
+ func.func @matmul_256x1024x128_div_add() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c1024 = arith.constant 1024 : index
+ %c256 = arith.constant 256 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x128xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf16>> -> tensor<256x1024xf16>
+ %7 = tensor.empty() : tensor<256x1024xf16>
+ %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xf16>> -> tensor<256x128xf16>
+ %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1024xf16>> -> tensor<128x1024xf16>
+ %10 = tensor.empty() : tensor<256x1024xf16>
+ %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16):
+ %14 = arith.divf %in, %in_0 : f16
+ %15 = arith.addf %14, %in_1 : f16
+ linalg.yield %15 : f16
+ } -> tensor<256x1024xf16>
+ flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [32, 32], [0, 0, 32], [16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @matmul_256x1024x128_div_add
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
// CHECK: func.func @matmul_256x1024x128_div_add()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
-#map = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable public @batch_matmul_16x128x256x512_div {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @batch_matmul_16x128x256x512_div layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_16x128x256x512_div() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>> -> tensor<16x128x512xf16>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>> -> tensor<16x512x256xf16>
- %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>> -> tensor<16x128x256xf16>
- %7 = tensor.empty() : tensor<16x128x256xf16>
- %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
- %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
- %10 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : tensor<16x128x256xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %11 = arith.divf %in, %in_0 : f16
- linalg.yield %11 : f16
- } -> tensor<16x128x256xf16>
- flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+module {
+ func.func @batch_matmul_16x128x256x512_div() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x512xf16>> -> tensor<16x128x512xf16>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x512x256xf16>> -> tensor<16x512x256xf16>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x128x256xf16>> -> tensor<16x128x256xf16>
+ %7 = tensor.empty() : tensor<16x128x256xf16>
+ %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
+ %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16>
+ %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : tensor<16x128x256xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %11 = arith.divf %in, %in_0 : f16
+ linalg.yield %11 : f16
+ } -> tensor<16x128x256xf16>
+ flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x128x256xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 64], [1, 32, 32], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @batch_matmul_16x128x256x512_div
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
// CHECK: func.func @batch_matmul_16x128x256x512_div()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.batch_matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Linalg.generic that is a batch matmul.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable @generic_batch_matmul_32x8x512x64 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export @generic_batch_matmul_32x8x512x64 layout(#pipeline_layout)
- builtin.module {
- func.func @generic_batch_matmul_32x8x512x64() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>> -> tensor<128x32x64xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>> -> tensor<32x64x512xf16>
- %5 = tensor.empty() : tensor<32x128x512xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
- ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>)
- attrs = {linalg.memoized_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]} {
- ^bb0(%arg0: f16, %arg1: f16, %arg2: f16):
- %8 = arith.mulf %arg0, %arg1 : f16
- %9 = arith.addf %arg2, %8 : f16
- linalg.yield %9 : f16
- } -> tensor<32x128x512xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : tensor<32x128x512xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @generic_batch_matmul_32x8x512x64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x32x64xf16>> -> tensor<128x32x64xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x64x512xf16>> -> tensor<32x64x512xf16>
+ %5 = tensor.empty() : tensor<32x128x512xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.mulf %in, %in_0 : f16
+ %9 = arith.addf %out, %8 : f16
+ linalg.yield %9 : f16
+ } -> tensor<32x128x512xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : tensor<32x128x512xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x128x512xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 64], [1, 32, 32], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @generic_batch_matmul_32x8x512x64
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
// CHECK: func.func @generic_batch_matmul_32x8x512x64()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// K dim size not divisible by 32.
-#map = affine_map<(d0, d1) -> (d0, d1)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable public @batch_matmul_16x1024x1024x80 {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @batch_matmul_16x1024x1024x80 layout(#pipeline_layout)
- builtin.module {
- func.func @batch_matmul_16x1024x1024x80() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>> -> tensor<16x1024x80xf16>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>> -> tensor<16x80x1024xf16>
- %5 = tensor.empty() : tensor<16x1024x1024xf16>
- %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
- %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+module {
+ func.func @batch_matmul_16x1024x1024x80() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x1024x80xf16>> -> tensor<16x1024x80xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x80x1024xf16>> -> tensor<16x80x1024xf16>
+ %5 = tensor.empty() : tensor<16x1024x1024xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
+ %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<16x1024x1024xf16>>
+ return
}
}
// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 64], [1, 32, 32], [0, 0, 0, 16], [1, 16, 16, 16]{{\]}}>
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0 : i64, store_stage = 0 : i64}>
-//CHECK-LABEL: hal.executable.export public @batch_matmul_16x1024x1024x80
-// CHECK-SAME: subgroup_size = 32 : index
-// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-SAME: workgroup_size = [64 : index, 2 : index, 1 : index]
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 0 : i64}>
// CHECK: func.func @batch_matmul_16x1024x1024x80()
+// CHECK-SAME: translation_info = #[[$TRANSLATION]]
// CHECK: linalg.batch_matmul
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #[[$CONFIG]]
// -----
// Small K - not supported by cooperative matrix.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_256x1024x8 {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @matmul_256x1024x8 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_256x1024x8() {
- %c0 = arith.constant 0 : index
- %c1024 = arith.constant 1024 : index
- %c256 = arith.constant 256 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x8xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x8xf16>> -> tensor<256x8xf16>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>> -> tensor<8x1024xf16>
- %15 = tensor.empty() : tensor<256x1024xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
- ins(%8, %10 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%16 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+module {
+ func.func @matmul_256x1024x8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c1024 = arith.constant 1024 : index
+ %c256 = arith.constant 256 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x8xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x8xf16>> -> tensor<256x8xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<8x1024xf16>> -> tensor<8x1024xf16>
+ %5 = tensor.empty() : tensor<256x1024xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%6 : tensor<256x1024xf16>) -> tensor<256x1024xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
+ return
}
}
-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
-// CHECK-LABEL: hal.executable.export public @matmul_256x1024x8
-// CHECK-NOT: subgroup_size =
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
+// CHECK: func.func @matmul_256x1024x8
// CHECK-SAME: translation_info = #[[$TRANSLATION]]
-// CHECK-NOT: subgroup_size =
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir
index 9e6f9d7..84ddf22 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir
@@ -1,54 +1,31 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-codegen-materialize-user-configs, iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[128, 256], [16, 16]]>,
- translation_info = <SPIRVBaseVectorize>,
- workgroup_size = [16, 8, 1], subgroup_size = 64>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @user_config {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export public @matmul_128x1024x256 layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_128x1024x256() {
- %cst = arith.constant 0.000000e+00 : f32
- %c128 = arith.constant 128 : index
- %c1024 = arith.constant 1024 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
- %15 = tensor.empty() : tensor<128x1024xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation}
- ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%16 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[128, 256], [16, 16]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [16, 8, 1] subgroup_size = 64>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @matmul_128x1024x256() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c1024 = arith.constant 1024 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<128x256xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xf32>> -> tensor<128x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x1024xf32>> -> tensor<256x1024xf32>
+ %5 = tensor.empty() : tensor<128x1024xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation} ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x1024xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 256], [16, 16]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize>
-// CHECK: hal.executable.export public @matmul_128x1024x256
-// CHECK-SAME: subgroup_size = 64 : index
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [16, 8, 1] subgroup_size = 64>
+// CHECK: func.func @matmul_128x1024x256()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 8 : index, 1 : index]
-// CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering_config = #[[CONFIG]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir
index 9c25f17..14263f0 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir
@@ -1,36 +1,20 @@
// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-spirv-emulate-i64))))' %s | \
+// RUN: --pass-pipeline='builtin.module(func.func(iree-spirv-emulate-i64))' %s | \
// RUN: FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @buffer_types {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader], []>, #spirv.resource_limits<>>}>) {
- hal.executable.export @buffer_types layout(#pipeline_layout) attributes {
- workgroup_size = [32: index, 1: index, 1: index]
- }
- builtin.module {
- func.func @buffer_types() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : i64
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8xi32, #spirv.storage_class<StorageBuffer>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
-
- %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class<StorageBuffer>>
- %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
- %5 = arith.addi %4, %c1 : i64
- memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
-
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader], []>, #spirv.resource_limits<>>}>
+module {
+ func.func @buffer_types() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c1_i64 = arith.constant 1 : i64
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8xi32, #spirv.storage_class<StorageBuffer>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class<StorageBuffer>>
+ %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ %5 = arith.addi %4, %c1_i64 : i64
+ memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ return
}
}
@@ -46,45 +30,34 @@
// -----
-hal.executable private @emulate_1d_vector {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<>>}>) {
- hal.executable.export public @emulate_1d_vector ordinal(0)
- layout(#hal.pipeline.layout<push_constants = 0,
- sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @emulate_1d_vector() {
- %c95232 = arith.constant 95232 : index
- %c32 = arith.constant 32 : index
- %c0 = arith.constant 0 : index
- %c36864 = arith.constant 36864 : index
- %c1523712 = arith.constant 1523712 : index
- %c96 = arith.constant 96 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>{%c96}
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c1523712) : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>{%c36864}
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>{%c36864}
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %3 = gpu.thread_id x
- %4 = arith.muli %workgroup_id_x, %c32 : index
- %5 = arith.addi %3, %4 : index
- %6 = memref.load %0[%5] : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>
- %7 = arith.extsi %6 : vector<4xi32> to vector<4xi64>
- %8 = arith.extui %6 : vector<4xi32> to vector<4xi64>
- %9 = arith.muli %7, %8 : vector<4xi64>
- %10 = arith.addi %7, %9 : vector<4xi64>
- %11 = arith.trunci %10 : vector<4xi64> to vector<4xi32>
- %12 = arith.muli %workgroup_id_y, %c96 : index
- %13 = arith.addi %5, %12 : index
- %14 = arith.addi %13, %c95232 : index
- memref.store %11, %2[%14] : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<>>}>
+module {
+ func.func @emulate_1d_vector() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c95232 = arith.constant 95232 : index
+ %c32 = arith.constant 32 : index
+ %c0 = arith.constant 0 : index
+ %c36864 = arith.constant 36864 : index
+ %c1523712 = arith.constant 1523712 : index
+ %c96 = arith.constant 96 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>{%c96}
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c1523712) : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>{%c36864}
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>{%c36864}
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %thread_id_x = gpu.thread_id x
+ %3 = arith.muli %workgroup_id_x, %c32 : index
+ %4 = arith.addi %thread_id_x, %3 : index
+ %5 = memref.load %0[%4] : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>
+ %6 = arith.extsi %5 : vector<4xi32> to vector<4xi64>
+ %7 = arith.extui %5 : vector<4xi32> to vector<4xi64>
+ %8 = arith.muli %6, %7 : vector<4xi64>
+ %9 = arith.addi %6, %8 : vector<4xi64>
+ %10 = arith.trunci %9 : vector<4xi64> to vector<4xi32>
+ %11 = arith.muli %workgroup_id_y, %c96 : index
+ %12 = arith.addi %4, %11 : index
+ %13 = arith.addi %12, %c95232 : index
+ memref.store %10, %2[%13] : memref<?xvector<4xi32>, #spirv.storage_class<StorageBuffer>>
+ return
}
}
@@ -99,36 +72,19 @@
// CHECK: return
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @no_emulation {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, Int64], []>, #spirv.resource_limits<>>}>) {
- hal.executable.export @no_emulation layout(#pipeline_layout) attributes {
- workgroup_size = [32: index, 1: index, 1: index]
- }
- builtin.module {
- func.func @no_emulation() {
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : i64
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8xi32, #spirv.storage_class<StorageBuffer>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
-
- %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class<StorageBuffer>>
- %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
- %5 = arith.addi %4, %c1 : i64
- memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
-
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, Int64], []>, #spirv.resource_limits<>>}>
+module {
+ func.func @no_emulation() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c1_i64 = arith.constant 1 : i64
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8xi32, #spirv.storage_class<StorageBuffer>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class<StorageBuffer>>
+ %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ %5 = arith.addi %4, %c1_i64 : i64
+ memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class<StorageBuffer>>
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
index ca127da..78e0483 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
@@ -1,835 +1,448 @@
// RUN: iree-opt \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline)))' \
+// RUN: --pass-pipeline='builtin.module(iree-codegen-materialize-user-configs, iree-spirv-select-lowering-strategy-pass)' \
// RUN: --verify-diagnostics --split-input-file %s
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = []>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [16, 8, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{expected 1 levels of tiling sizes, got 0}}
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
- outs(%result: memref<4x16xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = []>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ // expected-error @+1 {{expected 1 levels of tiling sizes, got 0}}
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 64], [4, 4], [0, 0, 4]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- // expected-error @+1 {{expected workgroup size to have three dimensions for SPIR-V pipelines}}
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<64x16xf32>, memref<16x128xf32>)
- outs(%result: memref<64x128xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 64], [4, 4], [0, 0, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ // expected-error @+1 {{expected workgroup size to have three dimensions for SPIR-V pipelines}}
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 64], [4, 4], [0, 0, 4]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [16, 8, 128]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
- // expected-error @+1 {{expected workgroup size dimensions not exceeding [128, 128, 64]}}
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<64x16xf32>, memref<16x128xf32>)
- outs(%result: memref<64x128xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 64], [4, 4], [0, 0, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 128], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
+ // expected-error @+1 {{expected workgroup size dimensions not exceeding [128, 128, 64]}}
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 64], [4, 2], [0, 0, 4]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [32, 8, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
- // expected-error @+1 {{expected total invocation count in workgroup to be <= 128}}
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<64x16xf32>, memref<16x128xf32>)
- outs(%result: memref<64x128xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 64], [4, 2], [0, 0, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
+ // expected-error @+1 {{expected total invocation count in workgroup to be <= 128}}
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 64], [16, 8], [0, 0, 4]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [8, 2, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
- // expected-error @+1 {{expected total workgroup size to be multiple of 32}}
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<64x16xf32>, memref<16x128xf32>)
- outs(%result: memref<64x128xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 64], [16, 8], [0, 0, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [8, 2, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
+ // expected-error @+1 {{expected total workgroup size to be multiple of 32}}
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 60], [4, 4], [0, 0, 4]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [15, 8, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
- // expected-error @+1 {{expected each workgroup size dimension to be power of two}}
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<64x16xf32>, memref<16x128xf32>)
- outs(%result: memref<64x128xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 60], [4, 4], [0, 0, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [15, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32>
+ // expected-error @+1 {{expected each workgroup size dimension to be power of two}}
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 64, 4]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [16, 8, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x128xf32>
- // expected-error @+1 {{LHS shape is indivisible by first level tile size}}
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<48x16xf32>, memref<16x128xf32>)
- outs(%result: memref<48x128xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 64, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x128xf32>
+ // expected-error @+1 {{LHS shape is indivisible by first level tile size}}
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<48x16xf32>, memref<16x128xf32>) outs(%2 : memref<48x128xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 64, 4]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [16, 8, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_tensors {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x80xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x80xf32>
- // expected-error @+1 {{RHS shape is indivisible by first level tile size}}
- linalg.matmul {compilation_info = #compilation} ins(%lhs, %rhs : memref<64x16xf32>, memref<16x80xf32>)
- outs(%result: memref<64x80xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 64, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x80xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x80xf32>
+ // expected-error @+1 {{RHS shape is indivisible by first level tile size}}
+ linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x80xf32>) outs(%2 : memref<64x80xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16]]>,
- translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [128, 2, 1], subgroup_size = 64>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_tensor {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_tensor layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensor() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
- %15 = tensor.empty() : tensor<64x128xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<64x128xf16>) -> tensor<64x128xf16>
- // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}}
- %17 = linalg.matmul {compilation_info = #compilation}
- ins(%8, %10 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%16 : tensor<64x128xf16>) -> tensor<64x128xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1]
- : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [32, 32], [0, 0, 16]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [128, 2, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
+ %5 = tensor.empty() : tensor<64x128xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}}
+ %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ return
+ }
+}
+
+// -----
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [8, 8, 8]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [128, 2, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
+ %5 = tensor.empty() : tensor<64x128xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ // expected-error @+1 {{expected the fourth level tile sizes to match cooperative matrix sizes}}
+ %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [8, 8, 8]]>,
- translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [128, 2, 1], subgroup_size = 64>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_tensor {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_tensor layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensor() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
- %15 = tensor.empty() : tensor<64x128xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<64x128xf16>) -> tensor<64x128xf16>
- // expected-error @+1 {{expected the fourth level tile sizes to match cooperative matrix sizes}}
- %17 = linalg.matmul {compilation_info = #compilation}
- ins(%8, %10 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%16 : tensor<64x128xf16>) -> tensor<64x128xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1]
- : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32], [8, 8], [0, 0, 4], [16, 16, 16]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [256, 4, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
+ %5 = tensor.empty() : tensor<64x128xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ // expected-error @+1 {{expected subgroup tile sizes to be multiple of [16, 16, 16]}}
+ %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 32], [8, 8], [0, 0, 4], [16, 16, 16]]>,
- translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [256, 4, 1], subgroup_size = 64>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_tensor {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_tensor layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensor() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
- %15 = tensor.empty() : tensor<64x128xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<64x128xf16>) -> tensor<64x128xf16>
- // expected-error @+1 {{expected subgroup tile sizes to be multiple of [16, 16, 16]}}
- %17 = linalg.matmul {compilation_info = #compilation}
- ins(%8, %10 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%16 : tensor<64x128xf16>) -> tensor<64x128xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1]
- : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [16, 16, 16]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
+ %5 = tensor.empty() : tensor<64x128xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ // expected-error @+1 {{expected workgroup x component equals to (warp_size * wg_tile_n / subgroup_tile_n)}}
+ %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [16, 16, 16]]>,
- translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [64, 2, 1], subgroup_size = 64>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_tensor {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_tensor layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensor() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
- %15 = tensor.empty() : tensor<64x128xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<64x128xf16>) -> tensor<64x128xf16>
- // expected-error @+1 {{expected workgroup x component equals to (warp_size * wg_tile_n / subgroup_tile_n)}}
- %17 = linalg.matmul {compilation_info = #compilation}
- ins(%8, %10 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%16 : tensor<64x128xf16>) -> tensor<64x128xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1]
- : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [16, 16, 16]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [128, 4, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
+ %5 = tensor.empty() : tensor<64x128xf16>
+ %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ // expected-error @+1 {{expected workgroup y component equals to (wg_tile_m / subgroup_tile_m)}}
+ %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [16, 16, 16]]>,
- translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [128, 4, 1], subgroup_size = 64>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable public @matmul_tensor {
- hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64, min_subgroup_size = 32, max_subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_tensor layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_tensor() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x128xf16>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xf16>> -> tensor<64x32xf16>
- %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x128xf16>> -> tensor<32x128xf16>
- %15 = tensor.empty() : tensor<64x128xf16>
- %16 = linalg.fill ins(%cst : f16) outs(%15 : tensor<64x128xf16>) -> tensor<64x128xf16>
- // expected-error @+1 {{expected workgroup y component equals to (wg_tile_m / subgroup_tile_m)}}
- %17 = linalg.matmul {compilation_info = #compilation}
- ins(%8, %10 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%16 : tensor<64x128xf16>) -> tensor<64x128xf16>
- flow.dispatch.tensor.store %17, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1]
- : tensor<64x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x128xf16>>
- return
- }
- }
- }
-}
-
-// -----
-
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[0, 4, 4, 16], [0, 2, 2, 2], [0, 0, 0, 0, 1, 1, 4]]>,
- translation_info = <SPIRVBaseVectorize>,
- workgroup_size = [8, 2, 2]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_2d_nhwc_hwcf {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c112 = arith.constant 112 : index
- %c16 = arith.constant 16 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_z]
- scf.for %arg0 = %3 to %c112 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
- scf.for %arg1 = %5 to %c112 step %6 {
- %7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
- %8 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_x]
- scf.for %arg2 = %7 to %c16 step %8 {
- %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>> -> tensor<1x4x4x16xf32>
- %10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
- %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
- %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>> -> tensor<1x9x9x8xf32>
- %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>> -> tensor<3x3x8x16xf32>
- %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
- // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}}
- %15 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, compilation_info = #compilation, strides = dense<2> : tensor<2xi64>}
- ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>)
- outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
- flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 4, 4, 16], [0, 2, 2, 2], [0, 0, 0, 0, 1, 1, 4]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#map = affine_map<()[s0] -> (s0 * 4)>
+#map1 = affine_map<()[s0] -> (s0 * 16)>
+#map2 = affine_map<(d0) -> (d0 * 2)>
+#translation = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 2]>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c112 = arith.constant 112 : index
+ %c16 = arith.constant 16 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ %3 = affine.apply #map()[%workgroup_id_z]
+ %4 = affine.apply #map()[%workgroup_count_z]
+ scf.for %arg0 = %3 to %c112 step %4 {
+ %5 = affine.apply #map()[%workgroup_id_y]
+ %6 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %5 to %c112 step %6 {
+ %7 = affine.apply #map1()[%workgroup_id_x]
+ %8 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg2 = %7 to %c16 step %8 {
+ %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>> -> tensor<1x4x4x16xf32>
+ %10 = affine.apply #map2(%arg0)
+ %11 = affine.apply #map2(%arg1)
+ %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>> -> tensor<1x9x9x8xf32>
+ %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>> -> tensor<3x3x8x16xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+ // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}}
+ %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+ flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
}
- return
}
}
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[0, 6, 6, 16], [0, 3, 3, 2], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]]>,
- translation_info = <SPIRVBaseVectorize>,
- workgroup_size = [8, 2, 2]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_2d_nhwc_hwcf {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c112 = arith.constant 112 : index
- %c16 = arith.constant 16 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_z]
- scf.for %arg0 = %3 to %c112 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
- scf.for %arg1 = %5 to %c112 step %6 {
- %7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
- %8 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_x]
- scf.for %arg2 = %7 to %c16 step %8 {
- %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>> -> tensor<1x4x4x16xf32>
- %10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
- %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
- %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>> -> tensor<1x9x9x8xf32>
- %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>> -> tensor<3x3x8x16xf32>
- %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
- // expected-error @+1 {{expected first level tile size divides the output size [OH, OW, OC]}}
- %15 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, compilation_info = #compilation, strides = dense<2> : tensor<2xi64>}
- ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>)
- outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
- flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 6, 6, 16], [0, 3, 3, 2], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#map = affine_map<()[s0] -> (s0 * 4)>
+#map1 = affine_map<()[s0] -> (s0 * 16)>
+#map2 = affine_map<(d0) -> (d0 * 2)>
+#translation = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 2]>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c112 = arith.constant 112 : index
+ %c16 = arith.constant 16 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ %3 = affine.apply #map()[%workgroup_id_z]
+ %4 = affine.apply #map()[%workgroup_count_z]
+ scf.for %arg0 = %3 to %c112 step %4 {
+ %5 = affine.apply #map()[%workgroup_id_y]
+ %6 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %5 to %c112 step %6 {
+ %7 = affine.apply #map1()[%workgroup_id_x]
+ %8 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg2 = %7 to %c16 step %8 {
+ %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>> -> tensor<1x4x4x16xf32>
+ %10 = affine.apply #map2(%arg0)
+ %11 = affine.apply #map2(%arg1)
+ %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>> -> tensor<1x9x9x8xf32>
+ %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>> -> tensor<3x3x8x16xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+ // expected-error @+1 {{expected first level tile size divides the output size [OH, OW, OC]}}
+ %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+ flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
}
- return
}
}
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[0, 4, 4, 16], [0, 2, 2, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]]>,
- translation_info = <SPIRVBaseVectorize>,
- workgroup_size = [8, 2, 2]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @conv_2d_nhwc_hwcf {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c112 = arith.constant 112 : index
- %c16 = arith.constant 16 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_z]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_z]
- scf.for %arg0 = %3 to %c112 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
- scf.for %arg1 = %5 to %c112 step %6 {
- %7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_x]
- %8 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_x]
- scf.for %arg2 = %7 to %c16 step %8 {
- %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>> -> tensor<1x4x4x16xf32>
- %10 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
- %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
- %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>> -> tensor<1x9x9x8xf32>
- %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>> -> tensor<3x3x8x16xf32>
- %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
- // expected-error @+1 {{expected workgroup tile sizes to be the product of thread tile size and workgroup size}}
- %15 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, compilation_info = #compilation, strides = dense<2> : tensor<2xi64>}
- ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>)
- outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
- flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 4, 4, 16], [0, 2, 2, 4], [0, 0, 0, 0, 1, 1, 4], [0, 1, 0, 0]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#map = affine_map<()[s0] -> (s0 * 4)>
+#map1 = affine_map<()[s0] -> (s0 * 16)>
+#map2 = affine_map<(d0) -> (d0 * 2)>
+#translation = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [8, 2, 2]>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c112 = arith.constant 112 : index
+ %c16 = arith.constant 16 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ %3 = affine.apply #map()[%workgroup_id_z]
+ %4 = affine.apply #map()[%workgroup_count_z]
+ scf.for %arg0 = %3 to %c112 step %4 {
+ %5 = affine.apply #map()[%workgroup_id_y]
+ %6 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %5 to %c112 step %6 {
+ %7 = affine.apply #map1()[%workgroup_id_x]
+ %8 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg2 = %7 to %c16 step %8 {
+ %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>> -> tensor<1x4x4x16xf32>
+ %10 = affine.apply #map2(%arg0)
+ %11 = affine.apply #map2(%arg1)
+ %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x8xf32>> -> tensor<1x9x9x8xf32>
+ %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8x16xf32>> -> tensor<3x3x8x16xf32>
+ %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+ // expected-error @+1 {{expected workgroup tile sizes to be the product of thread tile size and workgroup size}}
+ %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32>
+ flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
}
- return
}
}
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[0, 1, 7, 64], [0, 1, 7, 2], [0, 0, 0, 0, 5, 5], [0, 1, 0, 0]]>,
- translation_info = <SPIRVBaseVectorize>,
- workgroup_size = [32, 1, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @depthwise_conv_2d_nhwc_hwc {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
- // expected-error @+1 {{expected tile sizes for KH and KW to be 1}}
- linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation}
- ins(%lhs, %rhs : memref<1x11x11x576xf32>, memref<5x5x576xf32>)
- outs(%result: memref<1x7x7x576xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 7, 64], [0, 1, 7, 2], [0, 0, 0, 0, 5, 5], [0, 1, 0, 0]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
+ // expected-error @+1 {{expected tile sizes for KH and KW to be 1}}
+ linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : memref<1x7x7x576xf32>)
+ return
}
}
// -----
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[0, 1, 7, 64], [0, 1, 7, 2], [0, 0, 0, 0, 1, 1], [0, 0, 1, 1]]>,
- translation_info = <SPIRVBaseVectorize>,
- workgroup_size = [32, 1, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @depthwise_conv_2d_nhwc_hwc {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 16384,
- max_compute_workgroup_invocations = 128,
- max_compute_workgroup_size = [128, 128, 64],
- subgroup_size = 32>>
- }>) {
- hal.executable.export @illegal layout(#pipeline_layout)
- builtin.module {
- func.func @illegal() {
- %c0 = arith.constant 0 : index
- %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
- %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
- %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
- // expected-error @+1 {{expected the fourth level of tile size to be [0, 1, 0, 0]}}
- linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation}
- ins(%lhs, %rhs : memref<1x11x11x576xf32>, memref<5x5x576xf32>)
- outs(%result: memref<1x7x7x576xf32>)
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 7, 64], [0, 1, 7, 2], [0, 0, 0, 0, 1, 1], [0, 0, 1, 1]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader], []>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#translation = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [32, 1, 1]>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
+ // expected-error @+1 {{expected the fourth level of tile size to be [0, 1, 0, 0]}}
+ linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : memref<1x7x7x576xf32>)
+ return
}
}
+
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
index 0a88a26..756ba3a 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
@@ -1,94 +1,64 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-spirv-lower-executable-target-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-codegen-spirv-configuration-pipeline, func.func(iree-spirv-lower-executable-target-pass))' %s | FileCheck %s
-#compilation = #iree_codegen.compilation_info<
- lowering_config = <tile_sizes = [[32, 128, 1, 32]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 1, store_stage = 1}>,
- workgroup_size = [32, 8, 1]>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
-
-hal.executable @matmul_i4_quant_weight {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [65535, 65535, 65535],
- subgroup_size = 32>>}>) {
- hal.executable.export public @matmul_i4_quant_weight ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @matmul_i4_quant_weight() {
- %c32 = arith.constant 32 : index
- %c128 = arith.constant 128 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<86x128x2048xi4>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<86x2048xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<86x2048xi4>>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
- %6 = flow.dispatch.tensor.load %3, offsets = [%5, 0, 0], sizes = [%c32, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>> -> tensor<?x86x128xf32>
- %7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, %7], sizes = [86, 128, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128x2048xi4>> -> tensor<86x128x?xi4>
- %9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %10 = flow.dispatch.tensor.load %1, offsets = [0, %9], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x2048xf32>> -> tensor<86x?xf32>
- %11 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %12 = flow.dispatch.tensor.load %2, offsets = [0, %11], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x2048xi4>> -> tensor<86x?xi4>
- %13 = tensor.empty() : tensor<86x128x128xf32>
- %cast = tensor.cast %8 : tensor<86x128x?xi4> to tensor<86x128x128xi4>
- %cast_0 = tensor.cast %10 : tensor<86x?xf32> to tensor<86x128xf32>
- %cast_1 = tensor.cast %12 : tensor<86x?xi4> to tensor<86x128xi4>
- %14 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d2)>,
- affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]
- } ins(%cast, %cast_0, %cast_1 : tensor<86x128x128xi4>, tensor<86x128xf32>, tensor<86x128xi4>) outs(%13 : tensor<86x128x128xf32>) {
- ^bb0(%in: i4, %in_4: f32, %in_5: i4, %out: f32):
- %20 = arith.extsi %in : i4 to i32
- %21 = arith.extsi %in_5 : i4 to i32
- %22 = arith.subi %20, %21 : i32
- %23 = arith.sitofp %22 : i32 to f32
- %24 = arith.mulf %23, %in_4 : f32
- linalg.yield %24 : f32
- } -> tensor<86x128x128xf32>
- %15 = tensor.empty() : tensor<32x128xf32>
- %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x128xf32>) -> tensor<32x128xf32>
- %cast_2 = tensor.cast %6 : tensor<?x86x128xf32> to tensor<32x86x128xf32>
- %17 = linalg.generic {
- indexing_maps = [
- affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>,
- affine_map<(d0, d1, d2, d3) -> (d2, d3, d1)>,
- affine_map<(d0, d1, d2, d3) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction", "reduction"]
- } ins(%cast_2, %14 : tensor<32x86x128xf32>, tensor<86x128x128xf32>) outs(%16 : tensor<32x128xf32>) attrs = {compilation_info = #compilation} {
- ^bb0(%in: f32, %in_4: f32, %out: f32):
- %20 = arith.mulf %in, %in_4 : f32
- %21 = arith.addf %out, %20 : f32
- linalg.yield %21 : f32
- } -> tensor<32x128xf32>
- %cast_3 = tensor.cast %17 : tensor<32x128xf32> to tensor<?x?xf32>
- %18 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
- %19 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- flow.dispatch.tensor.store %cast_3, %4, offsets = [%18, %19], sizes = [%c32, %c128], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
- return
- }
- }
+#config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 1, 32]]>
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [65535, 65535, 65535]>>}>
+#map = affine_map<()[s0] -> (s0 * 32)>
+#map1 = affine_map<()[s0] -> (s0 * 128)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+#map5 = affine_map<(d0, d1, d2, d3) -> (d2, d3, d1)>
+#map6 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
+#compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
+module {
+ func.func @matmul_i4_quant_weight() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32 = arith.constant 32 : index
+ %c128 = arith.constant 128 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<86x128x2048xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<86x2048xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<86x2048xi4>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %5 = affine.apply #map()[%workgroup_id_y]
+ %6 = flow.dispatch.tensor.load %3, offsets = [%5, 0, 0], sizes = [%c32, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xf32>> -> tensor<?x86x128xf32>
+ %7 = affine.apply #map1()[%workgroup_id_x]
+ %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, %7], sizes = [86, 128, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128x2048xi4>> -> tensor<86x128x?xi4>
+ %9 = affine.apply #map1()[%workgroup_id_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [0, %9], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x2048xf32>> -> tensor<86x?xf32>
+ %11 = affine.apply #map1()[%workgroup_id_x]
+ %12 = flow.dispatch.tensor.load %2, offsets = [0, %11], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x2048xi4>> -> tensor<86x?xi4>
+ %13 = tensor.empty() : tensor<86x128x128xf32>
+ %cast = tensor.cast %8 : tensor<86x128x?xi4> to tensor<86x128x128xi4>
+ %cast_0 = tensor.cast %10 : tensor<86x?xf32> to tensor<86x128xf32>
+ %cast_1 = tensor.cast %12 : tensor<86x?xi4> to tensor<86x128xi4>
+ %14 = linalg.generic {indexing_maps = [#map2, #map3, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cast, %cast_0, %cast_1 : tensor<86x128x128xi4>, tensor<86x128xf32>, tensor<86x128xi4>) outs(%13 : tensor<86x128x128xf32>) {
+ ^bb0(%in: i4, %in_4: f32, %in_5: i4, %out: f32):
+ %20 = arith.extsi %in : i4 to i32
+ %21 = arith.extsi %in_5 : i4 to i32
+ %22 = arith.subi %20, %21 : i32
+ %23 = arith.sitofp %22 : i32 to f32
+ %24 = arith.mulf %23, %in_4 : f32
+ linalg.yield %24 : f32
+ } -> tensor<86x128x128xf32>
+ %15 = tensor.empty() : tensor<32x128xf32>
+ %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x128xf32>) -> tensor<32x128xf32>
+ %cast_2 = tensor.cast %6 : tensor<?x86x128xf32> to tensor<32x86x128xf32>
+ %17 = linalg.generic {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_2, %14 : tensor<32x86x128xf32>, tensor<86x128x128xf32>) outs(%16 : tensor<32x128xf32>) attrs = {compilation_info = #compilation} {
+ ^bb0(%in: f32, %in_4: f32, %out: f32):
+ %20 = arith.mulf %in, %in_4 : f32
+ %21 = arith.addf %out, %20 : f32
+ linalg.yield %21 : f32
+ } -> tensor<32x128xf32>
+ %cast_3 = tensor.cast %17 : tensor<32x128xf32> to tensor<?x?xf32>
+ %18 = affine.apply #map()[%workgroup_id_y]
+ %19 = affine.apply #map1()[%workgroup_id_x]
+ flow.dispatch.tensor.store %cast_3, %4, offsets = [%18, %19], sizes = [%c32, %c128], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096x2048xf32>>
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir
index 0b0fb7d..2bb5062 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir
@@ -1,11 +1,14 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-spirv-lower-executable-target-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-spirv-configuration-pipeline, func.func(iree-spirv-lower-executable-target-pass)))))' %s | FileCheck %s
+
+// TODO(MaheshRavishankar): This test should be modified to run just on the inner module/func.func. This is
+// currently blocked because `TileAndDistributeToWorkgroups` runs the `FoldAffineMinOverWorkgroupIds` pattern,
+// which doesn't work without the entry point.
// Verify pipelining + multi-buffering.
#compilation = #iree_codegen.compilation_info<
lowering_config = <tile_sizes = [[64, 64, 16]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 2, store_stage = 1}>,
- workgroup_size = [16, 8, 1]>
+ translation_info = <SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 1], {pipeline_depth = 2, store_stage = 1}>>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
@@ -57,11 +60,9 @@
}
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize
-// CHECK: hal.executable.export public @matmul_f32_128x256x64
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 1]
// CHECK: func.func @matmul_f32_128x256x64()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: %[[CST0:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: memref.alloc() : memref<2x64x20xf32, #gpu.address_space<workgroup>>
// CHECK: memref.alloc() : memref<2x16x68xf32, #gpu.address_space<workgroup>>
@@ -149,11 +150,9 @@
}
// CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 3)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize
-// CHECK: hal.executable.export public @matmul_f32_128x256x64
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-SAME: workgroup_size = [16 : index, 8 : index, 1 : index]
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 1]
// CHECK: func.func @matmul_f32_128x256x64()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: %[[CST0:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: memref.alloc() : memref<3x64x20xf32, #gpu.address_space<workgroup>>
// CHECK: memref.alloc() : memref<3x16x68xf32, #gpu.address_space<workgroup>>
@@ -206,8 +205,7 @@
#compilation = #iree_codegen.compilation_info<
lowering_config = <tile_sizes = [[64, 256, 32]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 1, store_stage = 1}>,
- workgroup_size = [32, 8, 1]>
+ translation_info = <SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1], {pipeline_depth = 1, store_stage = 1}>>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
index a84e430..3a4a0ed 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
@@ -1,60 +1,41 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass, iree-spirv-lower-executable-target-pass)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass, func.func(iree-spirv-lower-executable-target-pass))' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable @i4_dequant_matvec_f32 {
- hal.executable.variant @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 32768,
- max_compute_workgroup_invocations = 512,
- max_compute_workgroup_size = [512, 512, 512],
- subgroup_size = 64>>
- }>) {
- hal.executable.export @i4_dequant_matvec_f32 layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @i4_dequant_matvec_f32() {
- %cst = arith.constant 0.000000e+00 : f32
- %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
- %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
- %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
- %14 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
- %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
- %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
- %18 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
- %19 = tensor.empty() : tensor<4096xf32>
- %20 = tensor.empty() : tensor<4096x86x128xf32>
- %21 = linalg.fill ins(%cst : f32) outs(%19 : tensor<4096xf32>) -> tensor<4096xf32>
- %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %16, %17 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%20 : tensor<4096x86x128xf32>) {
- ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
- %24 = arith.extui %in : i4 to i32
- %25 = arith.uitofp %24 : i32 to f32
- %26 = arith.subf %25, %in_1 : f32
- %27 = arith.mulf %26, %in_0 : f32
- linalg.yield %27 : f32
- } -> tensor<4096x86x128xf32>
- %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%18, %22 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%21 : tensor<4096xf32>) {
- ^bb0(%in: f32, %in_0: f32, %out: f32):
- %24 = arith.mulf %in, %in_0 : f32
- %25 = arith.addf %24, %out : f32
- linalg.yield %25 : f32
- } -> tensor<4096xf32>
- flow.dispatch.tensor.store %23, %14, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 32768, max_compute_workgroup_invocations = 512, max_compute_workgroup_size = [512, 512, 512], subgroup_size = 64>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0)>
+module {
+ func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
+ %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+ %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+ %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
+ %9 = tensor.empty() : tensor<4096xf32>
+ %10 = tensor.empty() : tensor<4096x86x128xf32>
+ %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
+ %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096x86x128xf32>) {
+ ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+ %14 = arith.extui %in : i4 to i32
+ %15 = arith.uitofp %14 : i32 to f32
+ %16 = arith.subf %15, %in_1 : f32
+ %17 = arith.mulf %16, %in_0 : f32
+ linalg.yield %17 : f32
+ } -> tensor<4096x86x128xf32>
+ %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%11 : tensor<4096xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %14 = arith.mulf %in, %in_0 : f32
+ %15 = arith.addf %14, %out : f32
+ linalg.yield %15 : f32
+ } -> tensor<4096xf32>
+ flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir
index 4a8dbf3..69f2583 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir
@@ -1,44 +1,27 @@
// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-decompose-softmax)), iree-spirv-select-lowering-strategy-pass, iree-spirv-lower-executable-target-pass)))' \
+// RUN: --pass-pipeline='builtin.module(func.func(iree-codegen-decompose-softmax), iree-spirv-select-lowering-strategy-pass, func.func(iree-spirv-lower-executable-target-pass))' \
// RUN: %s | FileCheck %s
-#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.3,
- [Shader, GroupNonUniform, GroupNonUniformShuffle], [SPV_KHR_storage_buffer_storage_class]>, Unknown:Unknown,
- #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 32, cooperative_matrix_properties_khr = []>>}>
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-
-hal.executable @warp_reduction_dispatch {
- hal.executable.variant public @vulkan_spirv_fb target(#executable_target_vulkan_spirv_fb) {
- hal.executable.export public @warp_reduction_dispatch ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @warp_reduction_dispatch() {
- %c0 = arith.constant 0 : index
- %c10240 = arith.constant 10240 : index
- %cst = arith.constant 1.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x10240xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<512xf32>>
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 10240], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<512x10240xf32>> -> tensor<512x10240xf32>
- %8 = tensor.empty() : tensor<512xf32>
- %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<512xf32>) -> tensor<512xf32>
- %10 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]}
- ins(%5 : tensor<512x10240xf32>) outs(%9 : tensor<512xf32>) {
- ^bb0(%arg1: f32, %arg2: f32): // no predecessors
- %11 = arith.addf %arg1, %arg2 : f32
- linalg.yield %11 : f32
- } -> tensor<512xf32>
- flow.dispatch.tensor.store %10, %1, offsets = [0], sizes = [512], strides = [1]
- : tensor<512xf32> -> !flow.dispatch.tensor<writeonly:tensor<512xf32>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform, GroupNonUniformShuffle], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], cooperative_matrix_properties_khr = []>>}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0, d1) -> (d0)>
+module {
+ func.func @warp_reduction_dispatch() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %c10240 = arith.constant 10240 : index
+ %cst = arith.constant 1.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x10240xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<512xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 10240], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x10240xf32>> -> tensor<512x10240xf32>
+ %3 = tensor.empty() : tensor<512xf32>
+ %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<512xf32>) -> tensor<512xf32>
+ %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x10240xf32>) outs(%4 : tensor<512xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %6 = arith.addf %in, %out : f32
+ linalg.yield %6 : f32
+ } -> tensor<512xf32>
+ flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [512], strides = [1] : tensor<512xf32> -> !flow.dispatch.tensor<writeonly:tensor<512xf32>>
+ return
}
}
@@ -98,50 +81,31 @@
// -----
-#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.3,
- [Shader, GroupNonUniform, GroupNonUniformShuffle], [SPV_KHR_storage_buffer_storage_class]>, Unknown:Unknown,
- #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 32>>}>
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-
-hal.executable @warp_reduction_dispatch {
- hal.executable.variant public @vulkan_spirv_fb target(#executable_target_vulkan_spirv_fb) {
- hal.executable.export public @warp_reduction_dispatch ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @warp_reduction_dispatch() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf16>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf16>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf16>> -> tensor<10x9216x9216xf16>
- %3 = tensor.empty() : tensor<10x9216x9216xf16>
- %4 = tensor.empty() : tensor<10x9216xf16>
- %5 = linalg.fill ins(%cst : f16) outs(%4 : tensor<10x9216xf16>) -> tensor<10x9216xf16>
- %6 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel", "reduction"]}
- ins(%2 : tensor<10x9216x9216xf16>) outs(%5 : tensor<10x9216xf16>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.addf %in, %out : f16
- linalg.yield %8 : f16
- } -> tensor<10x9216xf16>
- %7 = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%2, %6 : tensor<10x9216x9216xf16>, tensor<10x9216xf16>) outs(%3 : tensor<10x9216x9216xf16>) {
- ^bb0(%in: f16, %in_0: f16, %out: f16):
- %8 = arith.divf %in, %in_0 : f16
- linalg.yield %8 : f16
- } -> tensor<10x9216x9216xf16>
- flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf16>>
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform, GroupNonUniformShuffle], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64]>>}>
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module {
+ func.func @warp_reduction_dispatch() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf16>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf16>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf16>> -> tensor<10x9216x9216xf16>
+ %3 = tensor.empty() : tensor<10x9216x9216xf16>
+ %4 = tensor.empty() : tensor<10x9216xf16>
+ %5 = linalg.fill ins(%cst : f16) outs(%4 : tensor<10x9216xf16>) -> tensor<10x9216xf16>
+ %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<10x9216x9216xf16>) outs(%5 : tensor<10x9216xf16>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.addf %in, %out : f16
+ linalg.yield %8 : f16
+ } -> tensor<10x9216xf16>
+ %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<10x9216x9216xf16>, tensor<10x9216xf16>) outs(%3 : tensor<10x9216x9216xf16>) {
+ ^bb0(%in: f16, %in_0: f16, %out: f16):
+ %8 = arith.divf %in, %in_0 : f16
+ linalg.yield %8 : f16
+ } -> tensor<10x9216x9216xf16>
+ flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf16> -> !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf16>>
+ return
}
}
@@ -191,40 +155,22 @@
// -----
-#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.3,
- [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:Unknown, #spirv.resource_limits<
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 64],
- subgroup_size = 32>>}>
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
-
-hal.executable @softmax {
-hal.executable.variant public @vulkan_spirv_fb target(#executable_target_vulkan_spirv_fb) {
- hal.executable.export public @softmax ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @softmax() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant -3.40282347E+38 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %cst_1 = arith.constant 1.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
- %3 = tensor.empty() : tensor<12x128x40960xf32>
- %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
- flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
- return
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64]>>}>
+module {
+ func.func @softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_1 = arith.constant 1.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
+ %3 = tensor.empty() : tensor<12x128x40960xf32>
+ %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
+ flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+ return
}
}
-}
// CHECK-LABEL: func.func @softmax
// CHECK: scf.for {{.*}} -> (vector<4xf32>) {
@@ -307,49 +253,25 @@
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer, ReadOnly>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-
-hal.executable private @dynamic_softmax {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, GroupNonUniformShuffle],
- [SPV_KHR_16bit_storage]>, api=Vulkan, Unknown:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>
- }>) {
- hal.executable.export public @dynamic_softmax ordinal(0) layout(#pipeline_layout) {
- ^bb0(%arg0: !hal.device, %arg1: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @dynamic_softmax() {
- %c32_i64 = arith.constant 32 : i64
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = flow.dispatch.workload.ordinal %6, 0 : index
- %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7}
- %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
- %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7} -> tensor<32x?xf16>
- %11 = tensor.empty(%7) : tensor<32x?xf16>
- %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16>
- flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
- return
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, GroupNonUniformShuffle], [SPV_KHR_16bit_storage]>, api=Vulkan, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+module {
+ func.func @dynamic_softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c32_i64 = arith.constant 32 : i64
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%6}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%6}
+ %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%6} -> tensor<32x?xf16>
+ %11 = tensor.empty(%6) : tensor<32x?xf16>
+ %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16>
+ flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %6], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%6}
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir
index 539e2d4..e3de27b 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass, iree-spirv-lower-executable-target-pass)))' -mlir-print-local-scope %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-spirv-select-lowering-strategy-pass, func.func(iree-spirv-lower-executable-target-pass)))))' -mlir-print-local-scope %s | FileCheck %s
#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {
spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, Unknown:Unknown,
@@ -32,11 +32,8 @@
}
}
-// CHECK-LABEL: hal.executable.export public @scalar_dispatch
-// CHECK-SAME: translation_info = #iree_codegen.translation_info<SPIRVBaseLowering>
-// CHECK-SAME: workgroup_size = [1 : index, 1 : index, 1 : index]
-
// CHECK: func.func @scalar_dispatch()
+// CHECK-SAME: translation_info = #iree_codegen.translation_info<SPIRVBaseLowering workgroup_size = [1, 1, 1]>
// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0)
// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1)
// CHECK: memref.load %[[SPAN0]][] : memref<i64, #hal.descriptor_type<storage_buffer>>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
index dcc9406..deaaef5 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-codegen-linalg-to-spirv-pipeline, canonicalize, cse)))' \
+// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-spirv-configuration-pipeline), iree-codegen-linalg-to-spirv-pipeline, canonicalize, cse)))' \
// RUN: %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
@@ -866,8 +866,7 @@
#compilation = #iree_codegen.compilation_info<
lowering_config = <tile_sizes = [[1, 64, 64], [1, 16, 64], [0, 0, 0, 16], [1, 16, 16, 16]]>,
- translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1, store_stage = 1}>,
- workgroup_size = [32, 4, 1], subgroup_size = 32>
+ translation_info = <SPIRVCooperativeMatrixVectorize workgroup_size = [32, 4, 1] subgroup_size = 32, {pipeline_depth = 1, store_stage = 1}>>
hal.executable public @batch_matmul_f16_16x4096x4096x64_truncf_mulf {
hal.executable.variant @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir
index d4e11c3..bdbad94 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-spirv-configuration-pipeline), iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
@@ -174,8 +174,7 @@
#user_config = #iree_codegen.compilation_info<
lowering_config = <tile_sizes = [[16, 128, 16]]>,
- translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
- workgroup_size = [16, 8, 1]>
+ translation_info = <SPIRVMatmulPromoteVectorize workgroup_size = [16, 8, 1], {pipeline_depth = 0, store_stage = 1}>>
hal.executable @matmul_f16_32x1280x1280 {
hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
index a3ed5b8..5b3eea9 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-spirv-configuration-pipeline), iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir
index 97e1244..1046bc7 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir
@@ -1,5 +1,5 @@
// RUN: iree-opt --split-input-file \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-codegen-linalg-to-spirv-pipeline)))' \
+// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-spirv-configuration-pipeline), iree-codegen-linalg-to-spirv-pipeline)))' \
// RUN: %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir
index 4407edd..a402e2d 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-spirv-configuration-pipeline), iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir
index 89a39fb..e2ede88 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-spirv-configuration-pipeline, iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-spirv-configuration-pipeline), iree-codegen-linalg-to-spirv-pipeline)))' %s | FileCheck %s
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
index 1aa48d0..21bc445 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir
@@ -1,46 +1,23 @@
// RUN: iree-opt %s --split-input-file \
-// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-spirv-select-lowering-strategy-pass)))"\
+// RUN: --pass-pipeline="builtin.module(iree-spirv-select-lowering-strategy-pass)"\
// RUN: --iree-spirv-enable-transform-dialect-jit=true | FileCheck %s
-hal.executable @matmul {
-hal.executable.variant public @vulkan target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f32, b_type = f32, c_type = f32, k_size = 8,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @matmul ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 8, a_type = f32, b_type = f32, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+module {
+ func.func @matmul() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>> -> tensor<2052x2556xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>> -> tensor<2556x2052xf32>
+ %5 = tensor.empty() : tensor<2052x2052xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
+ return
}
- builtin.module {
- func.func @matmul() {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2052x2556xf32>> -> tensor<2052x2556xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2556x2052xf32>> -> tensor<2556x2052xf32>
- %5 = tensor.empty() : tensor<2052x2052xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
- %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor<writeonly:tensor<2052x2052xf32>>
- return
- }
- }
-}
}
// CHECK-LABEL: func @matmul
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir
index a5a3190..a35d8aa 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir
@@ -1,89 +1,51 @@
// RUN: iree-opt --split-input-file --mlir-print-local-scope \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-spirv-tile-and-promote{promote-c=false skip-thread=true}, cse)))))' \
+// RUN: --pass-pipeline='builtin.module(func.func(iree-spirv-tile-and-promote{promote-c=false skip-thread=true}, cse))' \
// RUN: %s | FileCheck %s
// RUN: iree-opt --split-input-file --mlir-print-local-scope \
-// RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-spirv-tile-and-promote{promote-c=true skip-thread=true}, cse)))))' \
+// RUN: --pass-pipeline='builtin.module(func.func(iree-spirv-tile-and-promote{promote-c=true skip-thread=true}, cse))' \
// RUN: %s | FileCheck %s --check-prefix=PROMOTEC
// Single tile per workgroup means no subview ops for promotion.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 32], [16, 16, 16], [0, 0, 32]]>
-
-hal.executable @matmul_f16_32x32x32 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @matmul_f16_32x32x32 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize>,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @matmul_f16_32x32x32() {
- %c32 = arith.constant 32 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %0, 64 : memref<32x32xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %1, 64 : memref<32x32xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %2, 64 : memref<32x32xf16>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %3, 64 : memref<32x32xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
- %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_y]
- scf.for %arg0 = %4 to %c32 step %5 {
- %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
- %7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
- scf.for %arg1 = %6 to %c32 step %7 {
- linalg.fill ins(%cst : f16) outs(%3 : memref<32x32xf16>)
- linalg.matmul {lowering_config = #config}
- ins(%0, %1 : memref<32x32xf16>, memref<32x32xf16>) outs(%3 : memref<32x32xf16>)
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%2 : memref<32x32xf16>) outs(%3 : memref<32x32xf16>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.divf %out, %in : f16
- linalg.yield %8 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 32)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [64, 2, 1]>
+module {
+ func.func @matmul_f16_32x32x32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c32 = arith.constant 32 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %0, 64 : memref<32x32xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %1, 64 : memref<32x32xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %2, 64 : memref<32x32xf16>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %3, 64 : memref<32x32xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %4 to %c32 step %5 {
+ %6 = affine.apply #map()[%workgroup_id_x]
+ %7 = affine.apply #map()[%workgroup_count_x]
+ scf.for %arg1 = %6 to %c32 step %7 {
+ linalg.fill ins(%cst : f16) outs(%3 : memref<32x32xf16>)
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x32xf16>, memref<32x32xf16>) outs(%3 : memref<32x32xf16>)
+ linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<32x32xf16>) outs(%3 : memref<32x32xf16>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.divf %out, %in : f16
+ linalg.yield %8 : f16
}
- return
}
}
+ return
}
}
@@ -100,112 +62,60 @@
// CHECK-SAME: ins(%[[LHS]], %[[RHS]] : memref<32x32xf16>, memref<32x32xf16>)
-// PROMOTEC-LABEL: func.func @matmul_f16_32x32x32()
-
-// PROMOTEC: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// PROMOTEC: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1)
-
-// PROMOTEC-NOT: memref.alloc()
-// PROMOTEC-NOT: memref.copy
-
-// PROMOTEC: linalg.matmul
-// PROMOTEC-SAME: __internal_linalg_transform__ = "workgroup_memory"
-// PROMOTEC-SAME: ins(%[[LHS]], %[[RHS]] : memref<32x32xf16>, memref<32x32xf16>)
-
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 32, 32, 32], [1, 16, 16, 16], [0, 0, 0, 32]]>
-hal.executable @generic_batch_matmul_f16_32x128x512x64 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @generic_batch_matmul_f16_32x128x512x64 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize>,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @generic_batch_matmul_f16_32x128x512x64() {
- %c32 = arith.constant 32 : index
- %c128 = arith.constant 128 : index
- %c512 = arith.constant 512 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %span0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16>
- %span1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16>
- %span2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
- %span3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z {
- %3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_y]
- scf.for %arg1 = %3 to %c128 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
- scf.for %arg2 = %5 to %c512 step %6 {
- %subview = memref.subview %span2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
- %subview_0 = memref.subview %span0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>
- %subview_1 = memref.subview %span1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
- ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>)
- outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- attrs = {lowering_config = #config} {
- ^bb0(%in: f16, %in_2: f16, %out: f16):
- %7 = arith.mulf %in, %in_2 : f16
- %8 = arith.addf %out, %7 : f16
- linalg.yield %8 : f16
- }
- %subview_2 = memref.subview %span3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f16):
- // spirv.GL.Exp is not permitted to use cooperative matrix types per the spec.
- %8 = math.exp %in : f16
- linalg.yield %8 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 32)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [64, 2, 1]>
+module {
+ func.func @generic_batch_matmul_f16_32x128x512x64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c32 = arith.constant 32 : index
+ %c128 = arith.constant 128 : index
+ %c512 = arith.constant 512 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z {
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %4 to %c128 step %5 {
+ %6 = affine.apply #map()[%workgroup_id_x]
+ %7 = affine.apply #map()[%workgroup_count_x]
+ scf.for %arg2 = %6 to %c512 step %7 {
+ %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
+ %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>
+ %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>
+ linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
+ linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f16, %in_3: f16, %out: f16):
+ %8 = arith.mulf %in, %in_3 : f16
+ %9 = arith.addf %out, %8 : f16
+ linalg.yield %9 : f16
+ }
+ %subview_2 = memref.subview %3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
+ linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = math.exp %in : f16
+ linalg.yield %8 : f16
}
}
- return
}
}
+ return
}
}
@@ -264,97 +174,58 @@
// Cooperative matrix fusable elementwise ops do not need to promote C.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 32, 32, 32], [1, 16, 16, 16], [0, 0, 0, 32]]>
-hal.executable @generic_batch_matmul_f16_32x128x512x64 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @generic_batch_matmul_f16_32x128x512x64 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize>,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @generic_batch_matmul_f16_32x128x512x64() {
- %c32 = arith.constant 32 : index
- %c128 = arith.constant 128 : index
- %c512 = arith.constant 512 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %span0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16>
- %span1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16>
- %span2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
- %span3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z {
- %3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_y]
- scf.for %arg1 = %3 to %c128 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
- scf.for %arg2 = %5 to %c512 step %6 {
- %subview = memref.subview %span2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
- %subview_0 = memref.subview %span0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>
- %subview_1 = memref.subview %span1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
- ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>)
- outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- attrs = {lowering_config = #config} {
- ^bb0(%in: f16, %in_2: f16, %out: f16):
- %7 = arith.mulf %in, %in_2 : f16
- %8 = arith.addf %out, %7 : f16
- linalg.yield %8 : f16
- }
- %subview_2 = memref.subview %span3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.divf %out, %in : f16
- linalg.yield %8 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 32)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [64, 2, 1]>
+module {
+ func.func @generic_batch_matmul_f16_32x128x512x64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c32 = arith.constant 32 : index
+ %c128 = arith.constant 128 : index
+ %c512 = arith.constant 512 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z {
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %4 to %c128 step %5 {
+ %6 = affine.apply #map()[%workgroup_id_x]
+ %7 = affine.apply #map()[%workgroup_count_x]
+ scf.for %arg2 = %6 to %c512 step %7 {
+ %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
+ %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>
+ %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>
+ linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
+ linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f16, %in_3: f16, %out: f16):
+ %8 = arith.mulf %in, %in_3 : f16
+ %9 = arith.addf %out, %8 : f16
+ linalg.yield %9 : f16
+ }
+ %subview_2 = memref.subview %3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
+ linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.divf %out, %in : f16
+ linalg.yield %8 : f16
}
}
- return
}
}
+ return
}
}
@@ -379,94 +250,57 @@
// PROMOTEC-SAME: ins(%[[LHS_VIEW]], %[[RHS_VIEW]]
// PROMOTEC-SAME: __internal_linalg_transform__ = "workgroup_memory"
-
// -----
// No need to promote C if there are no fused elementwise ops.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 32, 32, 32], [1, 16, 16, 16], [0, 0, 0, 32]]>
-hal.executable @generic_batch_matmul_f16_32x128x512x64 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = i8, b_type = i8, c_type = i32, k_size = 32,
- m_size = 8, n_size = 8, result_type = i32, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>,
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f32, k_size = 16,
- m_size = 16, n_size = 16, result_type = f32, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [2147483647, 65535, 65535],
- subgroup_size = 32>
- >}>) {
- hal.executable.export public @generic_batch_matmul_f16_32x128x512x64 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize>,
- workgroup_size = [64 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @generic_batch_matmul_f16_32x128x512x64() {
- %c32 = arith.constant 32 : index
- %c128 = arith.constant 128 : index
- %c512 = arith.constant 512 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z {
- %3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_y]
- scf.for %arg1 = %3 to %c128 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
- scf.for %arg2 = %5 to %c512 step %6 {
- %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
- %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>
- %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
- ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>)
- outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- attrs = {lowering_config = #config} {
- ^bb0(%in: f16, %in_2: f16, %out: f16):
- %7 = arith.mulf %in, %in_2 : f16
- %8 = arith.addf %out, %7 : f16
- linalg.yield %8 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_KHR_variable_pointers, SPV_KHR_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [2147483647, 65535, 65535], cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>, #spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 32)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [64, 2, 1]>
+module {
+ func.func @generic_batch_matmul_f16_32x128x512x64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c32 = arith.constant 32 : index
+ %c128 = arith.constant 128 : index
+ %c512 = arith.constant 512 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z {
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %3 to %c128 step %4 {
+ %5 = affine.apply #map()[%workgroup_id_x]
+ %6 = affine.apply #map()[%workgroup_count_x]
+ scf.for %arg2 = %5 to %c512 step %6 {
+ %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
+ %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>
+ %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>
+ linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
+ linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} {
+ ^bb0(%in: f16, %in_2: f16, %out: f16):
+ %7 = arith.mulf %in, %in_2 : f16
+ %8 = arith.addf %out, %7 : f16
+ linalg.yield %8 : f16
}
}
- return
}
}
+ return
}
}
-
// PROMOTEC-LABEL: func.func @generic_batch_matmul_f16_32x128x512x64()
// PROMOTEC-NOT: memref.alloc()
@@ -501,76 +335,44 @@
// No need to promote again with allocations from bufferization.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]]>
-
-hal.executable @batch_matmul_f16_1x64x128x512 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>
- >}>) {
- hal.executable.export public @batch_matmul_f16_1x64x128x512 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize>,
- workgroup_size = [128 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @batch_matmul_f16_1x64x128x512() {
- %c4096 = arith.constant 4096 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x4096x512xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x512x4096xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<1x4096x4096xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
- scf.for %arg0 = %3 to %c4096 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
- scf.for %arg1 = %5 to %c4096 step %6 {
- %subview = memref.subview %2[0, %arg0, %arg1] [1, 64, 128] [1, 1, 1] : memref<1x4096x4096xf32> to memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>>
- %subview_0 = memref.subview %0[0, %arg0, 0] [1, 64, 512] [1, 1, 1] : memref<1x4096x512xf16> to memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>>
- %subview_1 = memref.subview %1[0, 0, %arg1] [1, 512, 128] [1, 1, 1] : memref<1x512x4096xf16> to memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>>
- %alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, #gpu.address_space<workgroup>>
- linalg.fill ins(%cst : f16) outs(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>)
- linalg.batch_matmul {lowering_config = #config}
- ins(%subview_0, %subview_1 : memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>>, memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>>)
- outs(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>)
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
- iterator_types = ["parallel", "parallel", "parallel"]}
- ins(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>)
- outs(%subview : memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f32):
- %7 = arith.extf %in : f16 to f32
- linalg.yield %7 : f32
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 64)>
+#map1 = affine_map<()[s0] -> (s0 * 128)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [128, 2, 1]>
+module {
+ func.func @batch_matmul_f16_1x64x128x512() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c4096 = arith.constant 4096 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x4096x512xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x512x4096xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<1x4096x4096xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %3 to %c4096 step %4 {
+ %5 = affine.apply #map1()[%workgroup_id_x]
+ %6 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg1 = %5 to %c4096 step %6 {
+ %subview = memref.subview %2[0, %arg0, %arg1] [1, 64, 128] [1, 1, 1] : memref<1x4096x4096xf32> to memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>>
+ %subview_0 = memref.subview %0[0, %arg0, 0] [1, 64, 512] [1, 1, 1] : memref<1x4096x512xf16> to memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>>
+ %subview_1 = memref.subview %1[0, 0, %arg1] [1, 512, 128] [1, 1, 1] : memref<1x512x4096xf16> to memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>>
+ %alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, #gpu.address_space<workgroup>>
+ linalg.fill ins(%cst : f16) outs(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>)
+ linalg.batch_matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>>, memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>>) outs(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>)
+ linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>) outs(%subview : memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>>) {
+ ^bb0(%in: f16, %out: f32):
+ %7 = arith.extf %in : f16 to f32
+ linalg.yield %7 : f32
}
- return
}
}
+ return
}
}
@@ -605,81 +407,47 @@
// PROMOTEC: gpu.barrier
// -----
-
-// Broadcasted elementwise ops does not need promoting C matrix.
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 128], [32, 64], [0, 0, 32], [16, 16, 16]]>
-
-hal.executable @matmul_f16_f512x4096x64 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_f16_f512x4096x64 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize>,
- workgroup_size = [128 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @matmul_f16_f512x4096x64() {
- %c512 = arith.constant 512 : index
- %c4096 = arith.constant 4096 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<512x64xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64x4096xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<4096xf16>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<512x4096xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
- %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
- scf.for %arg0 = %4 to %c512 step %5 {
- %6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
- scf.for %arg1 = %6 to %c4096 step %7 {
- %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
- %subview_0 = memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>>
- %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- linalg.matmul {lowering_config = #config}
- ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- %subview_2 = memref.subview %2[%arg1] [128] [1] : memref<4096xf16> to memref<128xf16, strided<[1], offset: ?>>
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%subview_2 : memref<128xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.addf %out, %in : f16
- linalg.yield %8 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 64)>
+#map1 = affine_map<()[s0] -> (s0 * 128)>
+#map2 = affine_map<(d0, d1) -> (d1)>
+#map3 = affine_map<(d0, d1) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [128, 2, 1]>
+module {
+ func.func @matmul_f16_f512x4096x64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c512 = arith.constant 512 : index
+ %c4096 = arith.constant 4096 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<512x64xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64x4096xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<4096xf16>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<512x4096xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %4 to %c512 step %5 {
+ %6 = affine.apply #map1()[%workgroup_id_x]
+ %7 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg1 = %6 to %c4096 step %7 {
+ %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
+ %subview_0 = memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>>
+ %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
+ linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
+ linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
+ %subview_2 = memref.subview %2[%arg1] [128] [1] : memref<4096xf16> to memref<128xf16, strided<[1], offset: ?>>
+ linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2 : memref<128xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.addf %out, %in : f16
+ linalg.yield %8 : f16
}
- return
}
}
+ return
}
}
@@ -725,79 +493,47 @@
// Transposed+broadcasted elementwise ops do not need promoting the C matrix.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 128], [32, 64], [0, 0, 32], [16, 16, 16]]>
-
-hal.executable @matmul_f16_f512x4096x64 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_f16_f512x4096x64 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize>,
- workgroup_size = [128 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @matmul_f16_f512x4096x64() {
- %c512 = arith.constant 512 : index
- %c4096 = arith.constant 4096 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<512x64xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64x4096xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<512xf16>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<512x4096xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
- %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
- scf.for %arg0 = %4 to %c512 step %5 {
- %6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
- scf.for %arg1 = %6 to %c4096 step %7 {
- %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
- %subview_0 = memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>>
- %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- linalg.matmul {lowering_config = #config}
- ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- %subview_2 = memref.subview %2[%arg0] [64] [1] : memref<512xf16> to memref<64xf16, strided<[1], offset: ?>>
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%subview_2 : memref<64xf16, strided<[1], offset: ?>>)
- outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.addf %out, %in : f16
- linalg.yield %8 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 64)>
+#map1 = affine_map<()[s0] -> (s0 * 128)>
+#map2 = affine_map<(d0, d1) -> (d0)>
+#map3 = affine_map<(d0, d1) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [128, 2, 1]>
+module {
+ func.func @matmul_f16_f512x4096x64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c512 = arith.constant 512 : index
+ %c4096 = arith.constant 4096 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<512x64xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64x4096xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<512xf16>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<512x4096xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %4 to %c512 step %5 {
+ %6 = affine.apply #map1()[%workgroup_id_x]
+ %7 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg1 = %6 to %c4096 step %7 {
+ %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
+ %subview_0 = memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>>
+ %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
+ linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
+ linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
+ %subview_2 = memref.subview %2[%arg0] [64] [1] : memref<512xf16> to memref<64xf16, strided<[1], offset: ?>>
+ linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2 : memref<64xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.addf %out, %in : f16
+ linalg.yield %8 : f16
}
- return
}
}
+ return
}
}
@@ -843,78 +579,50 @@
// Inlined large constant array needs promoting the C matrix.
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[64, 128], [32, 64], [0, 0, 32], [16, 16, 16]]>
-
-hal.executable @matmul_f16_128x262144x2304 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<
- #spirv.vce<v1.6,
- [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR],
- [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU,
- #spirv.resource_limits<
- cooperative_matrix_properties_khr = [
- #spirv.coop_matrix_props_khr<
- a_type = f16, b_type = f16, c_type = f16, k_size = 16,
- m_size = 16, n_size = 16, result_type = f16, acc_sat = false, scope = <Subgroup>>
- ],
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>
- >}>) {
- hal.executable.export public @matmul_f16_128x262144x2304 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize>,
- workgroup_size = [128 : index, 2 : index, 1 : index]
- }
- builtin.module {
- func.func @matmul_f16_128x262144x2304() {
- %c128 = arith.constant 128 : index
- %c262144 = arith.constant 262144 : index
- %c96565312 = arith.constant 96565312 : index
- %c806357120 = arith.constant 806357120 : index
- %c134217728 = arith.constant 134217728 : index
- %cst = arith.constant 0.000000e+00 : f16
- %cst_0 = arith.constant dense<"0x69222B2E40A3002A45AC1AAB2E2E202DA21C212680264C2A102314A041A7D029CB28352E5BAAD3B02F299D9A142B8AA1D1285C28412B25AF9A24EE2BA22C242D53AD9E2948A9289FCF301D28012F08AD68A6DD20ECAC912465290B2E9420C5AA50A222A912AB9526B62ADA2039AD4D912C9FDD287B20B224D329BA2A4D2C41A76DAB7E30B027F62ED1A0F1273A2BAE9D0FA48029812992A65AA92A2C9C2EE9A744A4632C5FA8A9A4CF2D70A482A0F5A2DBA7B6304B9D22A52B1B9DA8E424722AB5ACD0248A2B8B29C82D782E402D1A99F0A60CA4DE2DD32815266F2A6B247FA6FE214E2853AA402390AB6925F1A339307F2664A23CACBE28BA2B3D286DB0BA2E"> : tensor<128xf16>
- %0 = bufferization.to_memref %cst_0 : memref<128xf16>
- %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c96565312) : memref<128x2304xf16>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c806357120) : memref<2304x262144xf16>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c134217728) : memref<128x262144xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
- %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
- scf.for %arg0 = %4 to %c128 step %5 {
- %6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
- scf.for %arg1 = %6 to %c262144 step %7 {
- %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<128x262144xf16> to memref<64x128xf16, strided<[262144, 1], offset: ?>>
- %subview_1 = memref.subview %1[%arg0, 0] [64, 2304] [1, 1] : memref<128x2304xf16> to memref<64x2304xf16, strided<[2304, 1], offset: ?>>
- %subview_2 = memref.subview %2[0, %arg1] [2304, 128] [1, 1] : memref<2304x262144xf16> to memref<2304x128xf16, strided<[262144, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>)
- linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 128], [32, 64], [0, 0, 32], [16, 16, 16]]>}
- ins(%subview_1, %subview_2 : memref<64x2304xf16, strided<[2304, 1], offset: ?>>, memref<2304x128xf16, strided<[262144, 1], offset: ?>>)
- outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>)
- %subview_3 = memref.subview %0[%arg0] [64] [1] : memref<128xf16> to memref<64xf16, strided<[1], offset: ?>>
- linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]}
- ins(%subview_3 : memref<64xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.addf %out, %in : f16
- linalg.yield %8 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16, StorageBuffer16BitAccess, StorageUniform16, CooperativeMatrixKHR], [SPV_NV_cooperative_matrix]>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64, cooperative_matrix_properties_khr = [#spirv.coop_matrix_props_khr<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>>]>>}>
+#map = affine_map<()[s0] -> (s0 * 64)>
+#map1 = affine_map<()[s0] -> (s0 * 128)>
+#map2 = affine_map<(d0, d1) -> (d0)>
+#map3 = affine_map<(d0, d1) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize workgroup_size = [128, 2, 1]>
+module {
+ func.func @matmul_f16_128x262144x2304() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c128 = arith.constant 128 : index
+ %c262144 = arith.constant 262144 : index
+ %c96565312 = arith.constant 96565312 : index
+ %c806357120 = arith.constant 806357120 : index
+ %c134217728 = arith.constant 134217728 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %cst_0 = arith.constant dense<"0x69222B2E40A3002A45AC1AAB2E2E202DA21C212680264C2A102314A041A7D029CB28352E5BAAD3B02F299D9A142B8AA1D1285C28412B25AF9A24EE2BA22C242D53AD9E2948A9289FCF301D28012F08AD68A6DD20ECAC912465290B2E9420C5AA50A222A912AB9526B62ADA2039AD4D912C9FDD287B20B224D329BA2A4D2C41A76DAB7E30B027F62ED1A0F1273A2BAE9D0FA48029812992A65AA92A2C9C2EE9A744A4632C5FA8A9A4CF2D70A482A0F5A2DBA7B6304B9D22A52B1B9DA8E424722AB5ACD0248A2B8B29C82D782E402D1A99F0A60CA4DE2DD32815266F2A6B247FA6FE214E2853AA402390AB6925F1A339307F2664A23CACBE28BA2B3D286DB0BA2E"> : tensor<128xf16>
+ %0 = bufferization.to_memref %cst_0 : memref<128xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c96565312) : memref<128x2304xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c806357120) : memref<2304x262144xf16>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c134217728) : memref<128x262144xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %4 to %c128 step %5 {
+ %6 = affine.apply #map1()[%workgroup_id_x]
+ %7 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg1 = %6 to %c262144 step %7 {
+ %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<128x262144xf16> to memref<64x128xf16, strided<[262144, 1], offset: ?>>
+ %subview_1 = memref.subview %1[%arg0, 0] [64, 2304] [1, 1] : memref<128x2304xf16> to memref<64x2304xf16, strided<[2304, 1], offset: ?>>
+ %subview_2 = memref.subview %2[0, %arg1] [2304, 128] [1, 1] : memref<2304x262144xf16> to memref<2304x128xf16, strided<[262144, 1], offset: ?>>
+ linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>)
+ linalg.matmul {lowering_config = #config} ins(%subview_1, %subview_2 : memref<64x2304xf16, strided<[2304, 1], offset: ?>>, memref<2304x128xf16, strided<[262144, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>)
+ %subview_3 = memref.subview %0[%arg0] [64] [1] : memref<128xf16> to memref<64xf16, strided<[1], offset: ?>>
+ linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<64xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.addf %out, %in : f16
+ linalg.yield %8 : f16
}
- return
}
}
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir
index 0cc2d76..41c0e7c 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir
@@ -1,69 +1,46 @@
-// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-spirv-tile-and-promote, cse)))))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline='builtin.module(func.func(iree-spirv-tile-and-promote, cse))' %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[128, 128], [16, 4], [0, 0, 32]]>
-
-hal.executable @matmul_f32_256x1024x128 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 49152,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [65535, 65535, 65535],
- subgroup_size = 32>>}>) {
- hal.executable.export public @matmul_f32_256x1024x128 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize>,
- workgroup_size = [32 : index, 8 : index, 1 : index]
- }
- builtin.module {
- func.func @matmul_f32_256x1024x128() {
- %c1024 = arith.constant 1024 : index
- %c256 = arith.constant 256 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<256x128xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1024xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xf32>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_y]
- %5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_y]
- scf.for %arg0 = %4 to %c256 step %5 {
- %6 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
- %7 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
- scf.for %arg1 = %6 to %c1024 step %7 {
- %8 = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- %9 = memref.subview %0[%arg0, 0] [128, 128] [1, 1] : memref<256x128xf32> to memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
- %10 = memref.subview %1[0, %arg1] [128, 128] [1, 1] : memref<128x1024xf32> to memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- %11 = memref.subview %3[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>
- linalg.fill
- ins(%cst : f32) outs(%11 : memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
- linalg.matmul {lowering_config = #config}
- ins(%9, %10 : memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
- outs(%11 : memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%11, %8 : memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>, memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>)
- outs(%11 : memref<128x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>>) {
- ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):
- %12 = arith.divf %arg2, %arg3 : f32
- linalg.yield %12 : f32
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.5, [Shader], []>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [65535, 65535, 65535]>>}>
+#map = affine_map<()[s0] -> (s0 * 128)>
+#map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
+#map2 = affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>
+#map3 = affine_map<(d0, d1) -> (d0, d1)>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1]>
+module {
+ func.func @matmul_f32_256x1024x128() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c1024 = arith.constant 1024 : index
+ %c256 = arith.constant 256 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<256x128xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1024xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xf32>
+ %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %4 to %c256 step %5 {
+ %6 = affine.apply #map()[%workgroup_id_x]
+ %7 = affine.apply #map()[%workgroup_count_x]
+ scf.for %arg1 = %6 to %c1024 step %7 {
+ %subview = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, #map1>
+ %subview_0 = memref.subview %0[%arg0, 0] [128, 128] [1, 1] : memref<256x128xf32> to memref<128x128xf32, #map2>
+ %subview_1 = memref.subview %1[0, %arg1] [128, 128] [1, 1] : memref<128x1024xf32> to memref<128x128xf32, #map1>
+ %subview_2 = memref.subview %3[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, #map1>
+ linalg.fill ins(%cst : f32) outs(%subview_2 : memref<128x128xf32, #map1>)
+ linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<128x128xf32, #map2>, memref<128x128xf32, #map1>) outs(%subview_2 : memref<128x128xf32, #map1>)
+ linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2, %subview : memref<128x128xf32, #map1>, memref<128x128xf32, #map1>) outs(%subview_2 : memref<128x128xf32, #map1>) {
+ ^bb0(%in: f32, %in_3: f32, %out: f32):
+ %8 = arith.divf %in, %in_3 : f32
+ linalg.yield %8 : f32
}
- return
}
}
+ return
}
}
@@ -129,72 +106,50 @@
// CHECK-SAME: outs(%[[VIEW_C]]
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 64, 256], [1, 8, 8], [0, 0, 0, 16]]>
-
-hal.executable @batch_matmul_16x1024x1024x80 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>}>) {
- hal.executable.export public @batch_matmul_16x1024x1024x80 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize>,
- workgroup_size = [32 : index, 8 : index, 1 : index]
- }
- builtin.module {
- func.func @batch_matmul_16x1024x1024x80() {
- %c0 = arith.constant 0 : index
- %c16 = arith.constant 16 : index
- %c1024 = arith.constant 1024 : index
- %cst = arith.constant 0.111803398 : f32
- %cst_0 = arith.constant 0.000000e+00 : f16
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<16x1024x80xf16>
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<16x80x1024xf16>
- %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x1024x1024xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z {
- %9 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
- %10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
- scf.for %arg1 = %9 to %c1024 step %10 {
- %11 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x]
- %12 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_count_x]
- scf.for %arg2 = %11 to %c1024 step %12 {
- %subview = memref.subview %8[%arg0, %arg1, %arg2] [1, 64, 256] [1, 1, 1] : memref<16x1024x1024xf16> to memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>
- %subview_1 = memref.subview %6[%arg0, %arg1, 0] [1, 64, 80] [1, 1, 1] : memref<16x1024x80xf16> to memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>>
- %subview_2 = memref.subview %7[%arg0, 0, %arg2] [1, 80, 256] [1, 1, 1] : memref<16x80x1024xf16> to memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>>
- linalg.fill
- ins(%cst_0 : f16) outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>)
- linalg.batch_matmul {lowering_config = #config}
- ins(%subview_1, %subview_2 : memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>>, memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>>)
- outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>)
- linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]}
- outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) {
- ^bb0(%out: f16):
- %13 = arith.truncf %cst : f32 to f16
- %14 = arith.mulf %out, %13 : f16
- linalg.yield %14 : f16
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float16], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<()[s0] -> (s0 * 64)>
+#map1 = affine_map<()[s0] -> (s0 * 256)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [32, 8, 1]>
+module {
+ func.func @batch_matmul_16x1024x1024x80() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %c16 = arith.constant 16 : index
+ %c1024 = arith.constant 1024 : index
+ %cst = arith.constant 0.111803398 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<16x1024x80xf16>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<16x80x1024xf16>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x1024x1024xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z {
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %3 to %c1024 step %4 {
+ %5 = affine.apply #map1()[%workgroup_id_x]
+ %6 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg2 = %5 to %c1024 step %6 {
+ %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 64, 256] [1, 1, 1] : memref<16x1024x1024xf16> to memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>
+ %subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 80] [1, 1, 1] : memref<16x1024x80xf16> to memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>>
+ %subview_2 = memref.subview %1[%arg0, 0, %arg2] [1, 80, 256] [1, 1, 1] : memref<16x80x1024xf16> to memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>>
+ linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>)
+ linalg.batch_matmul {lowering_config = #config} ins(%subview_1, %subview_2 : memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>>, memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>>) outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>)
+ linalg.generic {indexing_maps = [#map2], iterator_types = ["parallel", "parallel", "parallel"]} outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) {
+ ^bb0(%out: f16):
+ %7 = arith.truncf %cst : f32 to f16
+ %8 = arith.mulf %out, %7 : f16
+ linalg.yield %8 : f16
}
}
- return
}
}
+ return
}
}
@@ -213,64 +168,43 @@
// CHECK: gpu.barrier
// -----
-
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
#config = #iree_codegen.lowering_config<tile_sizes = [[1, 512, 8], [1, 8, 4], [0, 0, 0, 16]]>
-
-hal.executable @batch_matmul_f32_16x4096x40x4096 {
- hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
- spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<
- max_compute_shared_memory_size = 65536,
- max_compute_workgroup_invocations = 1024,
- max_compute_workgroup_size = [1024, 1024, 1024],
- subgroup_size = 64>>}>) {
- hal.executable.export public @batch_matmul_f32_16x4096x40x4096 ordinal(0) layout(#pipeline_layout) attributes {
- translation_info = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize>,
- workgroup_size = [2 : index, 64 : index, 1 : index]
- }
- builtin.module {
- func.func @batch_matmul_f32_16x4096x40x4096() {
- %c16 = arith.constant 16 : index
- %c4096 = arith.constant 4096 : index
- %c40 = arith.constant 40 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x4096xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x40xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x40xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z {
- %3 = affine.apply affine_map<()[s0] -> (s0 * 512)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 512)>()[%workgroup_count_y]
- scf.for %arg1 = %3 to %c4096 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 8)>()[%workgroup_count_x]
- scf.for %arg2 = %5 to %c40 step %6 {
- %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 512, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>
- %subview_0 = memref.subview %0[%arg0, %arg1, 0] [1, 512, 4096] [1, 1, 1] : memref<16x4096x4096xf32> to memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>>
- %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 4096, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>>
- linalg.fill
- ins(%cst : f32) outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>)
- linalg.batch_matmul {lowering_config = #config}
- ins(%subview_0, %subview_1 : memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>>, memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>>)
- outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>)
- }
- }
+#executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader], []>, AMD:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 65536, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 1024], subgroup_size = 64>>}>
+#map = affine_map<()[s0] -> (s0 * 512)>
+#map1 = affine_map<()[s0] -> (s0 * 8)>
+#translation = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize workgroup_size = [2, 64, 1]>
+module {
+ func.func @batch_matmul_f32_16x4096x40x4096() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb, translation_info = #translation} {
+ %c16 = arith.constant 16 : index
+ %c4096 = arith.constant 4096 : index
+ %c40 = arith.constant 40 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x4096xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x40xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x40xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %workgroup_id_z = hal.interface.workgroup.id[2] : index
+ %workgroup_count_z = hal.interface.workgroup.count[2] : index
+ scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z {
+ %3 = affine.apply #map()[%workgroup_id_y]
+ %4 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg1 = %3 to %c4096 step %4 {
+ %5 = affine.apply #map1()[%workgroup_id_x]
+ %6 = affine.apply #map1()[%workgroup_count_x]
+ scf.for %arg2 = %5 to %c40 step %6 {
+ %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 512, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>
+ %subview_0 = memref.subview %0[%arg0, %arg1, 0] [1, 512, 4096] [1, 1, 1] : memref<16x4096x4096xf32> to memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>>
+ %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 4096, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>>
+ linalg.fill ins(%cst : f32) outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>)
+ linalg.batch_matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>>, memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>>) outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>)
}
- return
}
}
+ return
}
}
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir
index c999237..020dccd 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-opt --split-input-file --iree-spirv-vectorize-load-store --canonicalize -cse --mlir-print-local-scope %s | FileCheck %s
-// RUN: iree-opt --split-input-file --iree-spirv-vectorize-load-store --cse --mlir-print-local-scope %s | FileCheck %s --check-prefix=BASE
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-spirv-vectorize-load-store, canonicalize, cse))" --mlir-print-local-scope %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-spirv-vectorize-load-store, cse))" --mlir-print-local-scope %s | FileCheck %s --check-prefix=BASE
func.func @alloc_transfer_read_write_vector4_vector8(%arg0: memref<4096x4096xf32>, %x: index, %y: index) {
%cst = arith.constant 0.000000e+00 : f32
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
index c12ff9a..e8ed371 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp
@@ -362,8 +362,8 @@
b.create<IREE::transform_dialect::
ApplyFoldTensorSliceIntoTransferPatternsOp>(loc);
});
- b.create<IREEEliminateEmptyTensorsOp>(variantH);
- variantH = b.create<IREEBufferizeOp>(variantH, targetGpu);
+ b.create<IREEEliminateEmptyTensorsOp>(funcH);
+ variantH = b.create<IREEBufferizeOp>(funcH, targetGpu);
return variantH;
}
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp
index f877ed5..9272403 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp
@@ -688,8 +688,8 @@
});
b.create<IREE::transform_dialect::ApplyLoopIndependentCodeMotionOp>(funcH);
b.create<mlir::transform::ApplyCommonSubexpressionEliminationOp>(funcH);
- b.create<IREEEliminateEmptyTensorsOp>(variantH);
- auto bufferizeOp = b.create<IREEBufferizeOp>(variantH, /*targetGpu=*/true);
+ b.create<IREEEliminateEmptyTensorsOp>(funcH);
+ auto bufferizeOp = b.create<IREEBufferizeOp>(funcH, /*targetGpu=*/true);
bufferizeOp.setTargetGpu(true);
variantH = bufferizeOp.getResult();
Value memrefFunc =
diff --git a/compiler/src/iree/compiler/Codegen/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Transforms/Transforms.cpp
index 29b82e5..c6deeae 100644
--- a/compiler/src/iree/compiler/Codegen/Transforms/Transforms.cpp
+++ b/compiler/src/iree/compiler/Codegen/Transforms/Transforms.cpp
@@ -438,9 +438,9 @@
LogicalResult lowerWorkgroupCountFromSliceOp(
RewriterBase &rewriter, mlir::FunctionOpInterface entryPointFn,
ArrayRef<OpFoldResult> workgroupCount, int maxWorkgroupParallelDims) {
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp =
+ std::optional<IREE::HAL::ExecutableExportOp> exportOp =
getEntryPoint(entryPointFn);
- if (failed(exportOp)) {
+ if (!exportOp) {
return entryPointFn.emitOpError(
"expected function to be entry point function");
}
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
index 181af75..152772e 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -83,32 +83,6 @@
return procInfo;
}
-std::array<int64_t, 3> getWorkgroupSize(mlir::FunctionOpInterface funcOp) {
- std::array<int64_t, 3> workgroupSize;
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp =
- mlir::iree_compiler::getEntryPoint(funcOp);
- std::optional<mlir::ArrayAttr> workgroupSizeAttr =
- exportOp->getWorkgroupSize();
- assert(workgroupSizeAttr.has_value());
- for (auto [index, attr] : llvm::enumerate(workgroupSizeAttr.value())) {
- workgroupSize[index] =
- llvm::cast<mlir::IntegerAttr>(attr).getValue().getZExtValue();
- }
- return workgroupSize;
-}
-
-std::optional<int64_t> getSubgroupSize(mlir::FunctionOpInterface funcOp) {
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp =
- mlir::iree_compiler::getEntryPoint(funcOp);
- if (failed(exportOp)) {
- return std::nullopt;
- }
- if (IntegerAttr attr = exportOp->getSubgroupSizeAttr()) {
- return attr.getValue().getSExtValue();
- }
- return std::nullopt;
-}
-
//===----------------------------------------------------------------------===//
// GPU vectorization
//===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
index 095f24d..8768f63 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -35,12 +35,6 @@
getSubgroupIdsAndCounts(OpBuilder &builder, Location loc, unsigned warpSize,
unsigned numDims, llvm::ArrayRef<int64_t> numSubgroups);
-/// Returns the workgroup size associated to the funcOp entry point.
-std::array<int64_t, 3> getWorkgroupSize(mlir::FunctionOpInterface funcOp);
-
-/// Returns the subgroup size associated to the funcOp entry point.
-std::optional<int64_t> getSubgroupSize(mlir::FunctionOpInterface funcOp);
-
//===----------------------------------------------------------------------===//
// GPU vectorization
//===----------------------------------------------------------------------===//
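With these helpers removed, the expectation is that workgroup and subgroup sizes are read from the `translation_info` attribute now attached to the surrounding function rather than from `hal.executable.export`. Below is a minimal sketch of such a query; `getTranslationInfo` is used elsewhere in this diff, but the `getWorkgroupSize` accessor on `TranslationInfoAttr` is an assumed name, not verified against the final API.
```
// Sketch only: the accessor name on TranslationInfoAttr is an assumption.
#include <optional>
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/Interfaces/FunctionInterfaces.h"

static std::optional<llvm::SmallVector<int64_t>>
queryWorkgroupSize(mlir::FunctionOpInterface funcOp) {
  // translation_info now lives on the function itself.
  auto translationInfo = mlir::iree_compiler::getTranslationInfo(funcOp);
  if (!translationInfo)
    return std::nullopt;
  // Assumed accessor for the workgroup_size field carried by the attribute.
  return llvm::to_vector(translationInfo.getWorkgroupSize());
}
```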
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
index c4a2460..7217a56 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
@@ -35,43 +35,23 @@
// Utility functions to get entry points
//===----------------------------------------------------------------------===//
-FailureOr<IREE::HAL::ExecutableExportOp>
+std::optional<IREE::HAL::ExecutableExportOp>
getEntryPoint(mlir::FunctionOpInterface funcOp) {
auto variantOp = funcOp->getParentOfType<IREE::HAL::ExecutableVariantOp>();
- if (!variantOp)
- return failure();
+ if (!variantOp) {
+ return std::nullopt;
+ }
for (auto op : variantOp.getExportOps()) {
if (op.getSymName() == funcOp.getName()) {
return op;
}
}
- return failure();
-}
-
-FailureOr<IREE::HAL::ExecutableVariantOp>
-getExecutableVariantOp(Operation *op) {
- if (auto result = dyn_cast<IREE::HAL::ExecutableVariantOp>(op)) {
- return result;
- }
- if (auto result = op->getParentOfType<IREE::HAL::ExecutableVariantOp>()) {
- return result;
- }
- return failure();
+ return std::nullopt;
}
bool isEntryPoint(mlir::FunctionOpInterface func) {
- return func.isPublic() && succeeded(getEntryPoint(func));
-}
-
-llvm::StringMap<IREE::HAL::ExecutableExportOp>
-getAllEntryPoints(ModuleOp module) {
- auto variantOp = module->getParentOfType<IREE::HAL::ExecutableVariantOp>();
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps;
- for (auto op : variantOp.getExportOps()) {
- exportOps[op.getSymName()] = op;
- }
- return exportOps;
+ return func.isPublic() && getEntryPoint(func);
}
std::optional<StringAttr>
@@ -1122,8 +1102,8 @@
/// Infer the number of workgroups from exportOp.
SmallVector<int64_t> getStaticNumWorkgroups(mlir::FunctionOpInterface funcOp) {
SmallVector<int64_t> result;
- FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
- if (failed(exportOp))
+ std::optional<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
+ if (!exportOp)
return result;
Block *body = exportOp->getWorkgroupCountBody();
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
index e146572..7c34200 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
@@ -33,18 +33,10 @@
/// Returns true if the given `func` is a kernel dispatch entry point.
bool isEntryPoint(mlir::FunctionOpInterface func);
-/// Returns a map from function symbol name to corresponding entry point op.
-llvm::StringMap<IREE::HAL::ExecutableExportOp>
-getAllEntryPoints(ModuleOp module);
-
/// Returns the entry point op for the `funcOp`. Returns `nullptr` on failure.
-FailureOr<IREE::HAL::ExecutableExportOp>
+std::optional<IREE::HAL::ExecutableExportOp>
getEntryPoint(mlir::FunctionOpInterface funcOp);
-/// Returns the ExecutableVariableOp enclosing `op`. Returns `nullptr` on
-/// failure.
-FailureOr<IREE::HAL::ExecutableVariantOp> getExecutableVariantOp(Operation *op);
-
/// Returns the StringAttr with the name `stringAttr` in the `targetAttr`, if
/// found.
std::optional<StringAttr>
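Call sites of `getEntryPoint` outside this diff migrate mechanically from `failed(...)` checks to boolean tests on the optional. A small sketch of the before/after pattern follows; the surrounding function is illustrative only.
```
// Illustrative call-site migration for the new getEntryPoint() signature.
static LogicalResult lookupExport(mlir::FunctionOpInterface funcOp) {
  // Previously: FailureOr<IREE::HAL::ExecutableExportOp> + failed(exportOp).
  std::optional<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
  if (!exportOp) {
    return funcOp.emitOpError("expected an entry point function");
  }
  // Dereference the optional to keep using the export op as before.
  IREE::HAL::ExecutableExportOp entryPoint = *exportOp;
  (void)entryPoint;
  return success();
}
```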
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.cpp
index 762f1cd..cfb5758 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.cpp
@@ -117,22 +117,13 @@
return success();
}
-LogicalResult initVMVXLaunchConfig(ModuleOp moduleOp) {
- llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
- getAllEntryPoints(moduleOp);
- for (auto funcOp : moduleOp.getOps<mlir::FunctionOpInterface>()) {
- auto exportOp = exportOps.lookup(funcOp.getName());
- if (!exportOp) {
- continue;
- }
+LogicalResult initVMVXLaunchConfig(FunctionOpInterface funcOp) {
+ if (getTranslationInfo(funcOp)) {
+ return success();
+ }
- if (getTranslationInfo(exportOp)) {
- continue;
- }
-
- if (failed(setConfigForKernel(funcOp))) {
- return failure();
- }
+ if (failed(setConfigForKernel(funcOp))) {
+ return failure();
}
return success();
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.h b/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.h
index 57a1d0a..adf009d 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.h
+++ b/compiler/src/iree/compiler/Codegen/VMVX/KernelDispatch.h
@@ -8,11 +8,11 @@
#define IREE_COMPILER_CODEGEN_VMVX_KERNELDISPATCH_H_
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
-#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
namespace mlir::iree_compiler {
-LogicalResult initVMVXLaunchConfig(ModuleOp moduleOp);
+LogicalResult initVMVXLaunchConfig(FunctionOpInterface funcOp);
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/Passes.cpp b/compiler/src/iree/compiler/Codegen/VMVX/Passes.cpp
index 4ba9e57..a358f7b 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/VMVX/Passes.cpp
@@ -35,34 +35,27 @@
"ukernels are enabled (experimental)"),
llvm::cl::init(true));
-static void addTileAndDistributePasses(OpPassManager &pm) {
- pm.addPass(createTileAndDistributeToWorkgroupsPass());
- auto &nestedModulePM = pm.nest<ModuleOp>();
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConvertToDestinationPassingStylePass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFoldAffineMinInDistributedLoopsPass());
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createFuseTensorPadWithConsumerPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createConcretizePadResultShapePass());
- nestedModulePM.addNestedPass<func::FuncOp>(
+static void addTileAndDistributePasses(OpPassManager &funcPassManager) {
+ funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass());
+ funcPassManager.addPass(createConvertToDestinationPassingStylePass());
+ funcPassManager.addPass(createFoldAffineMinInDistributedLoopsPass());
+ funcPassManager.addPass(createCanonicalizerPass());
+ funcPassManager.addPass(createCSEPass());
+ funcPassManager.addPass(createFuseTensorPadWithConsumerPass());
+ funcPassManager.addPass(createConcretizePadResultShapePass());
+ funcPassManager.addPass(
IREE::LinalgExt::createTileAndDecomposeAttentionPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
+ funcPassManager.addPass(
IREE::LinalgExt::createTileAndDecomposeWinogradTransformPass());
}
-void addVMVXDefaultPassPipeline(OpPassManager &passManager,
+void addVMVXDefaultPassPipeline(OpPassManager &funcPassManager,
bool enableUKernels) {
- addTileAndDistributePasses(passManager);
+ addTileAndDistributePasses(funcPassManager);
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
if (enableUKernels) {
- nestedModulePM.addNestedPass<func::FuncOp>(
- createDecomposeBatchMmt4DOpsPass());
- nestedModulePM.addPass(
+ funcPassManager.addPass(createDecomposeBatchMmt4DOpsPass());
+ funcPassManager.addPass(
createCPULowerToUKernelsPass(clSkipIntermediateRoundings));
}
@@ -70,37 +63,33 @@
// Note that this must be done post-tiling because it changes the structure
// of the dispatch region such that tiling is not always possible.
if (enableUKernels && clEnableUKernelsDecomposeLinalgGeneric) {
- passManager.nest<ModuleOp>().nest<func::FuncOp>().addPass(
- createDecomposeLinalgGenericPass());
+ funcPassManager.addPass(createDecomposeLinalgGenericPass());
}
// Lower to buffers.
- addCPUBufferizePasses(nestedModulePM);
+ addCPUBufferizePasses(funcPassManager);
// Cleanup the IR that may now have unused loops.
- nestedModulePM.addNestedPass<func::FuncOp>(
- createRemoveSingleIterationLoopPass());
+ funcPassManager.addPass(createRemoveSingleIterationLoopPass());
// Convert buffer-level microkernels.
if (enableUKernels) {
- nestedModulePM.addPass(createLowerUKernelOpsToCallsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- createVMVXLowerLinalgMicrokernelsPass());
+ funcPassManager.addPass(createVMVXLowerLinalgMicrokernelsPass());
}
}
// NOTE: this runs on the top-level program module containing all
// hal.executable ops.
-void buildVMVXLinkingPassPipeline(OpPassManager &passManager) {
+void buildVMVXLinkingPassPipeline(OpPassManager &modulePassManager) {
// Link together executables. This may produce some IR duplication.
- passManager.addPass(createVMVXLinkExecutablesPass());
+ modulePassManager.addPass(createVMVXLinkExecutablesPass());
// Cleanup IR duplication.
- passManager.addNestedPass<IREE::HAL::ExecutableOp>(
+ modulePassManager.addNestedPass<IREE::HAL::ExecutableOp>(
mlir::createCanonicalizerPass());
// Assign final executable constant ordinals.
- passManager.nest<IREE::HAL::ExecutableOp>()
+ modulePassManager.nest<IREE::HAL::ExecutableOp>()
.addNestedPass<IREE::HAL::ExecutableVariantOp>(
createVMVXAssignConstantOrdinalsPass());
}
@@ -121,8 +110,8 @@
static PassPipelineRegistration<> VMVXLinkingPipeline(
"iree-codegen-vmvx-linking-pipeline",
"Runs the VMVX HAL executable linking pipeline",
- [](OpPassManager &passManager) {
- buildVMVXLinkingPassPipeline(passManager);
+ [](OpPassManager &modulePassManager) {
+ buildVMVXLinkingPassPipeline(modulePassManager);
});
}
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/Passes.h b/compiler/src/iree/compiler/Codegen/VMVX/Passes.h
index 4fd7ae4..63aa1db 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/VMVX/Passes.h
@@ -32,17 +32,16 @@
createVMVXMaterializeEncodingPass();
/// Pass to select a lowering strategy for a hal.executable.variant operation.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
-createVMVXSelectLoweringStrategyPass();
+std::unique_ptr<OperationPass<ModuleOp>> createVMVXSelectLoweringStrategyPass();
/// Pass to lower the module an hal.executable.variant operation to external
/// dialect.
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createVMVXLowerExecutableTargetPass();
/// Populates the passes to lower to tiled/distributed/bufferized ops,
/// suitable for library call dispatch and lowering to loops.
-void addVMVXDefaultPassPipeline(OpPassManager &passManager,
+void addVMVXDefaultPassPipeline(OpPassManager &funcPassManager,
bool enableUKernels);
//----------------------------------------------------------------------------//
@@ -57,7 +56,7 @@
std::unique_ptr<OperationPass<mlir::ModuleOp>> createVMVXLinkExecutablesPass();
/// Populates passes needed to link HAL executables across VMVX targets.
-void buildVMVXLinkingPassPipeline(OpPassManager &passManager);
+void buildVMVXLinkingPassPipeline(OpPassManager &variantPassManager);
//----------------------------------------------------------------------------//
// Register VMVX Passes
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/Passes.td b/compiler/src/iree/compiler/Codegen/VMVX/Passes.td
index 2d0bd1d..b8b703a 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/VMVX/Passes.td
@@ -20,8 +20,7 @@
}
def VMVXSelectLoweringStrategy :
- Pass<"iree-vmvx-select-lowering-strategy",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ Pass<"iree-vmvx-select-lowering-strategy", "ModuleOp"> {
let summary =
"Select a IREE::HAL::DispatchLoweringPassPipeline for lowering the variant";
let constructor =
@@ -35,8 +34,7 @@
}
def VMVXLowerExecutableTarget :
- Pass<"iree-vmvx-lower-executable-target",
- "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
+ InterfacePass<"iree-vmvx-lower-executable-target", "mlir::FunctionOpInterface"> {
let summary =
"Lower executable target using an IREE::HAL::DispatchLoweringPassPipeline";
let constructor =
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/VMVXLowerExecutableTargetPass.cpp b/compiler/src/iree/compiler/Codegen/VMVX/VMVXLowerExecutableTargetPass.cpp
index 51d2fb9..0eddc3a 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/VMVXLowerExecutableTargetPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/VMVX/VMVXLowerExecutableTargetPass.cpp
@@ -4,6 +4,7 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "iree/compiler/Codegen/Common/PassUtils.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
#include "iree/compiler/Codegen/VMVX/PassDetail.h"
@@ -11,6 +12,7 @@
#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
+#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -18,6 +20,8 @@
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"
+#define DEBUG_TYPE "iree-vmvx-lower-executable-target"
+
using mlir::iree_compiler::IREE::Codegen::LoweringConfigAttr;
namespace mlir::iree_compiler {
@@ -47,40 +51,45 @@
} // namespace
void VMVXLowerExecutableTargetPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
+ auto funcOp = getOperation();
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- variantOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
+ auto translationInfo = getTranslationInfo(funcOp);
+ if (!translationInfo)
+ return;
+
+ std::optional<OpPassManager> maybePipeline =
+ getFunctionOpInterfacePassManager(funcOp);
+ if (!maybePipeline) {
+ funcOp.emitOpError(
+ "unhandled function-like container during executable lowering");
+ return signalPassFailure();
+ }
+ OpPassManager &pipeline = maybePipeline.value();
+
+ auto target = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+ bool enableUKernels = target && hasUkernel(target);
+ switch (translationInfo.getDispatchLoweringPassPipeline()) {
+  // No pipeline specified, nothing to do.
+  case IREE::Codegen::DispatchLoweringPassPipeline::None:
+ return;
+ case IREE::Codegen::DispatchLoweringPassPipeline::VMVXDefault:
+ addVMVXDefaultPassPipeline(pipeline, enableUKernels);
+ break;
+ default:
+ funcOp.emitOpError("Unsupported pipeline on VMVX target.");
return signalPassFailure();
}
- OpPassManager pipeline(IREE::HAL::ExecutableVariantOp::getOperationName());
- if (translationInfo.has_value()) {
- auto target = variantOp.getTarget();
- bool enableUKernels = hasUkernel(target);
- switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
- // No pipleline specified, nothing to do.
- case IREE::Codegen::DispatchLoweringPassPipeline::None:
- return;
- case IREE::Codegen::DispatchLoweringPassPipeline::VMVXDefault:
- addVMVXDefaultPassPipeline(pipeline, enableUKernels);
- break;
- default:
- variantOp.emitOpError("Unsupported pipeline on VMVX target.");
- return signalPassFailure();
- }
- }
-
- if (failed(runPipeline(pipeline, variantOp))) {
+ LLVM_DEBUG({
+ llvm::dbgs() << "Using Pass pipeline : ";
+ pipeline.dump();
+ });
+ if (failed(runPipeline(pipeline, funcOp))) {
return signalPassFailure();
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<InterfacePass<FunctionOpInterface>>
createVMVXLowerExecutableTargetPass() {
return std::make_unique<VMVXLowerExecutableTargetPass>();
}
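For context, the two VMVX passes now compose at different nesting levels: strategy selection runs once on the inner module, and executable lowering runs per function. A hypothetical wiring sketch is below; the builder name and the `func.func` nesting are assumptions, with `func.func` standing in for any op implementing `FunctionOpInterface`.
```
// Hypothetical wiring; only the two create*Pass() calls come from this change.
static void addVMVXCodegenPasses(OpPassManager &variantPassManager) {
  OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
  // Annotates each function in the module with a translation_info attribute.
  modulePassManager.addPass(createVMVXSelectLoweringStrategyPass());
  // Reads translation_info per function and runs the selected pipeline.
  modulePassManager.addNestedPass<func::FuncOp>(
      createVMVXLowerExecutableTargetPass());
}
```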
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp
index afb0af0..5b25180 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/VMVX/VMVXSelectLoweringStrategy.cpp
@@ -50,25 +50,17 @@
} // namespace
void VMVXSelectLoweringStrategyPass::runOnOperation() {
- IREE::HAL::ExecutableVariantOp variantOp = getOperation();
- ModuleOp moduleOp = variantOp.getInnerModule();
-
- // Set the strategy with default heuristics.
- if (failed(initVMVXLaunchConfig(moduleOp))) {
- return signalPassFailure();
- }
-
- std::optional<IREE::Codegen::TranslationInfoAttr> translationInfo =
- getIdenticalTranslationInfo(variantOp);
- if (!translationInfo) {
- moduleOp.emitOpError(
- "unhandled compilation of entry point functions with different "
- "translation info");
- return signalPassFailure();
+ auto moduleOp = getOperation();
+ for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
+ // Set the strategy with default heuristics.
+ if (failed(initVMVXLaunchConfig(funcOp))) {
+ funcOp.emitOpError("failed to set lowering configuration");
+ return signalPassFailure();
+ }
}
}
-std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
+std::unique_ptr<OperationPass<ModuleOp>>
createVMVXSelectLoweringStrategyPass() {
return std::make_unique<VMVXSelectLoweringStrategyPass>();
}
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir b/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir
index ecdc60b..2002466 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir
+++ b/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir
@@ -1,43 +1,35 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-vmvx-select-lowering-strategy, iree-vmvx-lower-executable-target)))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-vmvx-select-lowering-strategy, func.func(iree-vmvx-lower-executable-target))" --split-input-file %s | FileCheck %s
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}>
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-hal.executable private @mmt4d_ukernel {
- hal.executable.variant public @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}>) {
- hal.executable.export public @mmt4d_i8 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @mmt4d_i8() {
- %c0 = arith.constant 0 : index
- %c256 = arith.constant 256 : index
- %c512 = arith.constant 512 : index
- %c16 = arith.constant 16 : index
- %0:2 = iree_codegen.query_tile_sizes tensor<16x16xi8, #iree_linalg_ext.encoding<role = LHS, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
- %1 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%0#0]
- %2 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%0#1]
- %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%1, %2, %0#0, %0#1}
- %4:2 = iree_codegen.query_tile_sizes tensor<16x16xi8, #iree_linalg_ext.encoding<role = RHS, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
- %5 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%4#0]
- %6 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%4#1]
- %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c256) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%5, %6, %4#0, %4#1}
- %8:2 = iree_codegen.query_tile_sizes tensor<16x16xi32, #iree_linalg_ext.encoding<role = RESULT, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
- %9 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%8#0]
- %10 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%8#1]
- %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c512) : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1}
- %15 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [%1, %2, %0#0, %0#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%1, %2, %0#0, %0#1} -> tensor<?x?x?x?xi8>
- %19 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%5, %6, %4#0, %4#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%5, %6, %4#0, %4#1} -> tensor<?x?x?x?xi8>
- %23 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [%9, %10, %8#0, %8#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1} -> tensor<?x?x?x?xi32>
- %24 = linalg.mmt4d ins(%15, %19 : tensor<?x?x?x?xi8>, tensor<?x?x?x?xi8>) outs(%23 : tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
- flow.dispatch.tensor.store %24, %11, offsets = [0, 0, 0, 0], sizes = [%9, %10, %8#0, %8#1], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1}
- return
- }
- }
+#map3 = affine_map<()[s0] -> (16 ceildiv s0)>
+module {
+ func.func @mmt4d_i8() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %c0 = arith.constant 0 : index
+ %c256 = arith.constant 256 : index
+ %c512 = arith.constant 512 : index
+ %c16 = arith.constant 16 : index
+ %0:2 = iree_codegen.query_tile_sizes tensor<16x16xi8, #iree_linalg_ext.encoding<role = LHS, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
+ %1 = affine.apply #map3()[%0#0]
+ %2 = affine.apply #map3()[%0#1]
+ %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%1, %2, %0#0, %0#1}
+ %4:2 = iree_codegen.query_tile_sizes tensor<16x16xi8, #iree_linalg_ext.encoding<role = RHS, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
+ %5 = affine.apply #map3()[%4#0]
+ %6 = affine.apply #map3()[%4#1]
+ %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c256) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%5, %6, %4#0, %4#1}
+ %8:2 = iree_codegen.query_tile_sizes tensor<16x16xi32, #iree_linalg_ext.encoding<role = RESULT, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
+ %9 = affine.apply #map3()[%8#0]
+ %10 = affine.apply #map3()[%8#1]
+ %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c512) : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1}
+ %12 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [%1, %2, %0#0, %0#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%1, %2, %0#0, %0#1} -> tensor<?x?x?x?xi8>
+ %13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%5, %6, %4#0, %4#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%5, %6, %4#0, %4#1} -> tensor<?x?x?x?xi8>
+ %14 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [%9, %10, %8#0, %8#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1} -> tensor<?x?x?x?xi32>
+ %15 = linalg.mmt4d ins(%12, %13 : tensor<?x?x?x?xi8>, tensor<?x?x?x?xi8>) outs(%14 : tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
+ flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0, 0], sizes = [%9, %10, %8#0, %8#1], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1}
+ return
}
}
-// CHECK: func private @vmvx.mmt4d(
// CHECK: func @mmt4d_i8()
-// CHECK: func.call @vmvx.mmt4d(
+// CHECK: iree_codegen.ukernel.generic "vmvx.mmt4d"
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir
index a330e07..bf5f372 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir
@@ -1,304 +1,236 @@
-// RUN: iree-opt -pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-vmvx-select-lowering-strategy)))' -split-input-file %s | FileCheck %s
+// RUN: iree-opt -pass-pipeline='builtin.module(iree-vmvx-select-lowering-strategy)' -split-input-file %s | FileCheck %s
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>
- ]>
-]>
-hal.executable private @matmul_static {
- hal.executable.variant @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb">) {
- hal.executable.export public @matmul_static layout(#pipeline_layout)
- builtin.module {
- func.func @matmul_static() {
- %cst = arith.constant 0.0 : f32
- %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
- %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
- %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [384, 512], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
- %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
- : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
- %init = tensor.empty() : tensor<384x128xf32>
- %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
- %gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
- outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
- flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
- : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
- return
- }
- }
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+module {
+ func.func @matmul_static() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+ %5 = tensor.empty() : tensor<384x128xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+ return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<VMVXDefault>
-// CHECK: hal.executable.export public @matmul_static
+// CHECK: func.func @matmul_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable @copy_op_dynamic {
- hal.executable.variant @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb">) {
- hal.executable.export @copy_op_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @copy_op_dynamic() {
- %d0 = hal.interface.constant.load[0] : index
- %d1 = hal.interface.constant.load[1] : index
- %d2 = hal.interface.constant.load[2] : index
- %d3 = hal.interface.constant.load[3] : index
- %o0 = hal.interface.constant.load[4] : index
- %o1 = hal.interface.constant.load[5] : index
- %source = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%d0, %d1}
- %dest = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%d2, %d3}
- %dest_view = memref.subview %dest[%o0, %o1] [%d0, %d1] [1, 1] : memref<?x?xi32> to memref<?x?xi32, strided<[?, 1], offset : ?>>
- linalg.generic {
- indexing_maps = [affine_map<(d0, d1) -> (d0, d1)> , affine_map<(d0, d1) -> (d0, d1)>],
- iterator_types = ["parallel", "parallel"]}
- ins(%source : memref<?x?xi32>) outs(%dest_view : memref<?x?xi32, strided<[?, 1], offset : ?>>) {
- ^bb0(%arg0 : i32, %arg1 : i32):
- linalg.yield %arg0 : i32
- }
- return
- }
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @copy_op_dynamic() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %0 = hal.interface.constant.load[0] : index
+ %1 = hal.interface.constant.load[1] : index
+ %2 = hal.interface.constant.load[2] : index
+ %3 = hal.interface.constant.load[3] : index
+ %4 = hal.interface.constant.load[4] : index
+ %5 = hal.interface.constant.load[5] : index
+ %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
+ %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%2, %3}
+ %subview = memref.subview %7[%4, %5] [%0, %1] [1, 1] : memref<?x?xi32> to memref<?x?xi32, strided<[?, 1], offset: ?>>
+ linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : memref<?x?xi32>) outs(%subview : memref<?x?xi32, strided<[?, 1], offset: ?>>) {
+ ^bb0(%in: i32, %out: i32):
+ linalg.yield %in : i32
}
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<VMVXDefault>
-// CHECK: hal.executable.export public @copy_op_dynamic
+// CHECK: func.func @copy_op_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @static_1d_fft_stage2 {
- hal.executable.variant @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb">) {
- hal.executable.export @static_1d_fft_stage2 layout(#pipeline_layout)
- builtin.module {
- func.func @static_1d_fft_stage2() {
- %c0 = arith.constant 0 : index
- %c2 = arith.constant 2 : index
- %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
- %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
- %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
- flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
- return
- }
- }
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+module {
+ func.func @static_1d_fft_stage2() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
+ %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+ %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
+ flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64]{{\]}}>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<VMVXDefault>
-// CHECK: hal.executable.export public @static_1d_fft_stage2
+// CHECK: func.func @static_1d_fft_stage2
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: func.func @static_1d_fft_stage2()
// CHECK: iree_linalg_ext.fft
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>,
- #hal.descriptor_set.binding<2, storage_buffer>,
- #hal.descriptor_set.binding<3, storage_buffer>,
- #hal.descriptor_set.binding<4, storage_buffer>
- ]>
-]>
-hal.executable @fusion_quant_matmul_generic {
- hal.executable.variant @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb">) {
- hal.executable.export @fusion_quant_matmul_generic layout(#pipeline_layout)
- builtin.module {
- func.func @fusion_quant_matmul_generic() {
- %c0_i32 = arith.constant 0 : i32
- %c-128_i32 = arith.constant -128 : i32
- %c1101627623_i32 = arith.constant 1101627623 : i32
- %c36_i8 = arith.constant 36 : i8
- %c127_i32 = arith.constant 127 : i32
- %c107520 = arith.constant 107520 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = arith.index_castui %0 : i32 to index
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3360x32xi8>>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c107520) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
- %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x3360xi8>>{%1}
- %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x32xi8>>{%1}
- %7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%1, 3360], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x3360xi8>>{%1} -> tensor<?x3360xi8>
- %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [3360, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3360x32xi8>> -> tensor<3360x32xi8>
- %9 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi32>> -> tensor<32xi32>
- %10 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi32>> -> tensor<32xi32>
- %11 = tensor.empty(%1) : tensor<?x32xi8>
- %12 = tensor.empty(%1) : tensor<?x32xi32>
- %13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<?x32xi32>) -> tensor<?x32xi32>
- %14 = linalg.matmul ins(%7, %8 : tensor<?x3360xi8>, tensor<3360x32xi8>) outs(%13 : tensor<?x32xi32>) -> tensor<?x32xi32>
- %15 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %14, %10 : tensor<32xi32>, tensor<?x32xi32>, tensor<32xi32>) outs(%11 : tensor<?x32xi8>) {
- ^bb0(%in: i32, %in_0: i32, %in_1: i32, %out: i8):
- %16 = arith.muli %in_1, %c-128_i32 : i32
- %17 = arith.subi %in_0, %16 : i32
- %18 = arith.addi %in, %17 : i32
- %19 = tosa.apply_scale %18, %c1101627623_i32, %c36_i8 {double_round = true} : (i32, i32, i8) -> i32
- %20 = arith.addi %19, %c-128_i32 : i32
- %21 = arith.cmpi slt, %20, %c-128_i32 : i32
- %22 = arith.select %21, %c-128_i32, %20 : i32
- %23 = arith.cmpi sgt, %20, %c127_i32 : i32
- %24 = arith.select %23, %c127_i32, %22 : i32
- %25 = arith.trunci %24 : i32 to i8
- linalg.yield %25 : i8
- } -> tensor<?x32xi8>
- flow.dispatch.tensor.store %15, %6, offsets = [0, 0], sizes = [%1, 32], strides = [1, 1] : tensor<?x32xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x32xi8>>{%1}
- return
- }
- }
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+#map = affine_map<(d0, d1) -> (d1)>
+#map1 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @fusion_quant_matmul_generic() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %c0_i32 = arith.constant 0 : i32
+ %c-128_i32 = arith.constant -128 : i32
+ %c1101627623_i32 = arith.constant 1101627623 : i32
+ %c36_i8 = arith.constant 36 : i8
+ %c127_i32 = arith.constant 127 : i32
+ %c107520 = arith.constant 107520 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = arith.index_castui %0 : i32 to index
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3360x32xi8>>
+ %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c107520) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
+ %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x3360xi8>>{%1}
+ %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x32xi8>>{%1}
+ %7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%1, 3360], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x3360xi8>>{%1} -> tensor<?x3360xi8>
+ %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [3360, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3360x32xi8>> -> tensor<3360x32xi8>
+ %9 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi32>> -> tensor<32xi32>
+ %10 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi32>> -> tensor<32xi32>
+ %11 = tensor.empty(%1) : tensor<?x32xi8>
+ %12 = tensor.empty(%1) : tensor<?x32xi32>
+ %13 = linalg.fill ins(%c0_i32 : i32) outs(%12 : tensor<?x32xi32>) -> tensor<?x32xi32>
+ %14 = linalg.matmul ins(%7, %8 : tensor<?x3360xi8>, tensor<3360x32xi8>) outs(%13 : tensor<?x32xi32>) -> tensor<?x32xi32>
+ %15 = linalg.generic {indexing_maps = [#map, #map1, #map, #map1], iterator_types = ["parallel", "parallel"]} ins(%9, %14, %10 : tensor<32xi32>, tensor<?x32xi32>, tensor<32xi32>) outs(%11 : tensor<?x32xi8>) {
+ ^bb0(%in: i32, %in_0: i32, %in_1: i32, %out: i8):
+ %16 = arith.muli %in_1, %c-128_i32 : i32
+ %17 = arith.subi %in_0, %16 : i32
+ %18 = arith.addi %in, %17 : i32
+ %19 = tosa.apply_scale %18, %c1101627623_i32, %c36_i8 {double_round = true} : (i32, i32, i8) -> i32
+ %20 = arith.addi %19, %c-128_i32 : i32
+ %21 = arith.cmpi slt, %20, %c-128_i32 : i32
+ %22 = arith.select %21, %c-128_i32, %20 : i32
+ %23 = arith.cmpi sgt, %20, %c127_i32 : i32
+ %24 = arith.select %23, %c127_i32, %22 : i32
+ %25 = arith.trunci %24 : i32 to i8
+ linalg.yield %25 : i8
+ } -> tensor<?x32xi8>
+ flow.dispatch.tensor.store %15, %6, offsets = [0, 0], sizes = [%1, 32], strides = [1, 1] : tensor<?x32xi8> -> !flow.dispatch.tensor<writeonly:tensor<?x32xi8>>{%1}
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<VMVXDefault>
-// CHECK: hal.executable.export public @fusion_quant_matmul_generic
+// CHECK: func.func @fusion_quant_matmul_generic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: func.func @fusion_quant_matmul_generic()
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #[[CONFIG]]
-
// -----
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
- #hal.descriptor_set.layout<0, bindings = [
- #hal.descriptor_set.binding<0, storage_buffer>,
- #hal.descriptor_set.binding<1, storage_buffer>
- ]>
-]>
-hal.executable private @unpack_outer_dynamic {
- hal.executable.variant @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb">) {
- hal.executable.export public @unpack_outer_dynamic layout(#pipeline_layout)
- builtin.module {
- func.func @unpack_outer_dynamic() {
- %c131072 = arith.constant 131072 : index
- %c0 = arith.constant 0 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = arith.index_castui %0 : i32 to index
- %5 = arith.index_castui %1 : i32 to index
- %6 = arith.index_castui %2 : i32 to index
- %7 = arith.index_castui %3 : i32 to index
- %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
- %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
- %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5} -> tensor<?x?x32x16xi32>
- %11 = tensor.empty(%6, %7) : tensor<?x?xi32>
- %12 = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : tensor<?x?x32x16xi32> -> tensor<?x?xi32>
- flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
- return
- }
- }
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+module {
+ func.func @unpack_outer_dynamic() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %c131072 = arith.constant 131072 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_castui %0 : i32 to index
+ %5 = arith.index_castui %1 : i32 to index
+ %6 = arith.index_castui %2 : i32 to index
+ %7 = arith.index_castui %3 : i32 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+ %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5} -> tensor<?x?x32x16xi32>
+ %11 = tensor.empty(%6, %7) : tensor<?x?xi32>
+ %unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : tensor<?x?x32x16xi32> -> tensor<?x?xi32>
+ flow.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<VMVXDefault>
-// CHECK: hal.executable.export public @unpack_outer_dynamic
+// CHECK: func.func @unpack_outer_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.unpack
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-hal.executable private @elem_pack_ukernels {
- hal.executable.variant public @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb", {ukernels = true}>) {
- hal.executable.export public @elem_pack_ukernels ordinal(0) layout(#hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @elem_pack_ukernels() {
- %cst = arith.constant 0.000000e+00 : f32
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x2048xf32>>
- %1:2 = iree_codegen.query_tile_sizes tensor<1024x2048xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<1024x2048xf32>>> -> index, index
- %2 = affine.apply affine_map<()[s0] -> (1024 ceildiv s0)>()[%1#0]
- %3 = affine.apply affine_map<()[s0] -> (2048 ceildiv s0)>()[%1#1]
- %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%2, %3, %1#0, %1#1}
- %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x2048xf32>> -> tensor<1024x2048xf32>
- %6 = tensor.empty() : tensor<1024x2048xf32>
- %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<1024x2048xf32>) outs(%6 : tensor<1024x2048xf32>) {
- ^bb0(%in: f32, %out: f32):
- %15 = arith.addf %in, %in : f32
- linalg.yield %15 : f32
- } -> tensor<1024x2048xf32>
- %8:2 = iree_codegen.query_tile_sizes tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<1024x2048xf32>>> -> index, index
- %9 = affine.apply affine_map<()[s0] -> (1024 ceildiv s0)>()[%8#0]
- %10 = affine.apply affine_map<()[s0] -> (2048 ceildiv s0)>()[%8#1]
- %11 = tensor.empty(%9, %10, %8#0, %8#1) : tensor<?x?x?x?xf32>
- %pack = tensor.pack %7 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [%8#0, %8#1] into %11 : tensor<1024x2048xf32> -> tensor<?x?x?x?xf32>
- %12:2 = iree_codegen.query_tile_sizes tensor<1024x2048xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<1024x2048xf32>>> -> index, index
- %13 = affine.apply affine_map<()[s0] -> (1024 ceildiv s0)>()[%12#0]
- %14 = affine.apply affine_map<()[s0] -> (2048 ceildiv s0)>()[%12#1]
- flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%13, %14, %12#0, %12#1], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%13, %14, %12#0, %12#1}
- return
- }
- }
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = true}>
+#map = affine_map<()[s0] -> (1024 ceildiv s0)>
+#map1 = affine_map<()[s0] -> (2048 ceildiv s0)>
+#map2 = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @elem_pack_ukernels() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x2048xf32>>
+ %1:2 = iree_codegen.query_tile_sizes tensor<1024x2048xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<1024x2048xf32>>> -> index, index
+ %2 = affine.apply #map()[%1#0]
+ %3 = affine.apply #map1()[%1#1]
+ %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%2, %3, %1#0, %1#1}
+ %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x2048xf32>> -> tensor<1024x2048xf32>
+ %6 = tensor.empty() : tensor<1024x2048xf32>
+ %7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<1024x2048xf32>) outs(%6 : tensor<1024x2048xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %15 = arith.addf %in, %in : f32
+ linalg.yield %15 : f32
+ } -> tensor<1024x2048xf32>
+ %8:2 = iree_codegen.query_tile_sizes tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<1024x2048xf32>>> -> index, index
+ %9 = affine.apply #map()[%8#0]
+ %10 = affine.apply #map1()[%8#1]
+ %11 = tensor.empty(%9, %10, %8#0, %8#1) : tensor<?x?x?x?xf32>
+ %pack = tensor.pack %7 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [%8#0, %8#1] into %11 : tensor<1024x2048xf32> -> tensor<?x?x?x?xf32>
+ %12:2 = iree_codegen.query_tile_sizes tensor<1024x2048xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<1024x2048xf32>>> -> index, index
+ %13 = affine.apply #map()[%12#0]
+ %14 = affine.apply #map1()[%12#1]
+ flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [%13, %14, %12#0, %12#1], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%13, %14, %12#0, %12#1}
+ return
}
}
+
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<VMVXDefault>
-// CHECK: hal.executable.export public @elem_pack_ukernels
+// CHECK: func.func @elem_pack_ukernels
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
-hal.executable private @copy_cst {
- hal.executable.variant public @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>) {
- hal.executable.export public @copy_cst ordinal(0) layout(#hal.pipeline.layout<push_constants = 10, sets = [<0, bindings = [<0, storage_buffer>]>]>) {
- ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
- %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
- hal.return %x, %y, %z : index, index, index
- }
- builtin.module {
- func.func @copy_cst() {
- %cst = arith.constant dense<4.200000e-01> : tensor<5x19x8x4xf32>
- %c32_i64 = arith.constant 32 : i64
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = arith.extui %0 : i32 to i64
- %3 = arith.extui %1 : i32 to i64
- %4 = arith.shli %3, %c32_i64 : i64
- %5 = arith.ori %2, %4 : i64
- %6 = arith.index_castui %5 : i64 to index
- %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) : !flow.dispatch.tensor<writeonly:tensor<5x19x8x4xf32>>
- flow.dispatch.tensor.store %cst, %7, offsets = [0, 0, 0, 0], sizes = [5, 19, 8, 4], strides = [1, 1, 1, 1] : tensor<5x19x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<5x19x8x4xf32>>
- return
- }
- }
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
+module {
+ func.func @copy_cst() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
+ %cst = arith.constant dense<4.200000e-01> : tensor<5x19x8x4xf32>
+ %c32_i64 = arith.constant 32 : i64
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = arith.extui %0 : i32 to i64
+ %3 = arith.extui %1 : i32 to i64
+ %4 = arith.shli %3, %c32_i64 : i64
+ %5 = arith.ori %2, %4 : i64
+ %6 = arith.index_castui %5 : i64 to index
+ %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) : !flow.dispatch.tensor<writeonly:tensor<5x19x8x4xf32>>
+ flow.dispatch.tensor.store %cst, %7, offsets = [0, 0, 0, 0], sizes = [5, 19, 8, 4], strides = [1, 1, 1, 1] : tensor<5x19x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<5x19x8x4xf32>>
+ return
}
}
+
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<VMVXDefault>
-// CHECK: hal.executable.export public @copy_cst
+// CHECK: func.func @copy_cst
// CHECK-SAME: translation_info = #[[TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
index 867af80..92894fe 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -2095,6 +2095,15 @@
// Calculates an XYZ workgroup size based on the given |workload|.
std::array<Value, 3> calculateWorkgroupSize(
Location loc, Value device, ValueRange workload, OpBuilder &builder);
+
+ // Helper function to return the subgroup size.
+ std::optional<uint64_t> getSubgroupSizeAsUInt() {
+ std::optional<APInt> subgroupSizeUInt = getSubgroupSize();
+ if (!subgroupSizeUInt) {
+ return std::nullopt;
+ }
+ return subgroupSizeUInt->getZExtValue();
+ }
}];
}
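
The helper added above wraps the optional `subgroup_size` attribute so callers get a plain `uint64_t` instead of handling `APInt` directly. A minimal usage sketch (the loop and the fallback value are illustrative, not part of this change):

```
// Sketch: read the subgroup size off each export op of a variant.
// `variantOp` is assumed to be an IREE::HAL::ExecutableVariantOp in scope.
for (auto exportOp : variantOp.getExportOps()) {
  // Empty when no subgroup size was set on the export.
  std::optional<uint64_t> subgroupSize = exportOp.getSubgroupSizeAsUInt();
  // Hypothetical fallback; real callers pick a target-specific default.
  uint64_t effectiveSize = subgroupSize.value_or(64);
  (void)effectiveSize;
}
```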
diff --git a/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.cpp
index c787d57..77dbaad 100644
--- a/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.cpp
@@ -28,120 +28,123 @@
namespace mlir::iree_compiler::IREE::VMVX {
-using FunctionLikeNest =
- MultiOpNest<func::FuncOp, IREE::Util::InitializerOp, IREE::Util::FuncOp>;
-
// ---------------------------------------------------------------------------
// Variant configuration
// ---------------------------------------------------------------------------
-void buildVMVXConfigurationPassPipeline(OpPassManager &passManager) {
- // ---------------------------------------------------------------------------
- // Tensor-level optimization, kernel dispatch and lower to buffers.
- // ---------------------------------------------------------------------------
- addCommonTargetExecutablePreprocessingPasses(passManager);
- FunctionLikeNest(passManager.nest<ModuleOp>()).addPass([&]() {
- return createCPUMaterializeEncodingPass();
- });
- // TODO: Remove the following pass the plumb support for #hal.descriptor_type
- // memory space through the stack.
- passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
- passManager.addPass(createVMVXSelectLoweringStrategyPass());
+static void
+buildVMVXConfigurationPassPipelineImpl(OpPassManager &modulePassManager) {
+ {
+ FunctionLikeNest funcPassManager(modulePassManager);
+ // ---------------------------------------------------------------------------
+ // Tensor-level optimization, kernel dispatch and lower to buffers.
+ // ---------------------------------------------------------------------------
+ addCommonTargetExecutablePreprocessingPasses(funcPassManager);
+ }
+ modulePassManager.addPass(createMaterializeUserConfigsPass());
+ FunctionLikeNest(modulePassManager)
+ .addPass([&]() { return createCPUMaterializeEncodingPass(); })
+      // TODO: Remove the following pass and plumb support for
+ // #hal.descriptor_type memory space through the stack.
+ .addPass(createEraseHALDescriptorTypeFromMemRefPass);
+ modulePassManager.addPass(createVMVXSelectLoweringStrategyPass());
+}
+
+void buildVMVXConfigurationPassPipeline(OpPassManager &variantPassManager) {
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+ buildVMVXConfigurationPassPipelineImpl(modulePassManager);
}
// ---------------------------------------------------------------------------
// Variant Translation
// ---------------------------------------------------------------------------
-static void buildVectorVMVXTransformPassPipeline(OpPassManager &passManager) {
+static void
+buildVectorVMVXTransformPassPipeline(OpPassManager &variantPassManager) {
+
+ OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
// ---------------------------------------------------------------------------
// Tensor-level optimization, kernel dispatch and lower to buffers.
// ---------------------------------------------------------------------------
- passManager.addPass(createVMVXLowerExecutableTargetPass());
-
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
+ {
+ FunctionLikeNest(modulePassManager)
+ .addPass(createVMVXLowerExecutableTargetPass);
+ }
+ modulePassManager.addPass(createLowerUKernelOpsToCallsPass());
// ---------------------------------------------------------------------------
// Linalg -> Vectors
// ---------------------------------------------------------------------------
- // Tiling and distribution.
- FunctionLikeNest(nestedModulePM).addPass(createCanonicalizerPass);
- // TODO(#5925): This can also be modified to just use the dynamic pass
- // pipeline like the CPU side.
- // FunctionLikeNest(nestedModulePM).addPass(
- // createLinalgTileAndVectorizeWorkgroupsPass);
+ FunctionLikeNest(modulePassManager)
+ .addPass(createCanonicalizerPass)
- // Linalg -> SCF.
- FunctionLikeNest(nestedModulePM)
- .addPass(IREE::LinalgExt::createLinalgExtToLoopsPass);
- FunctionLikeNest(nestedModulePM).addPass(createMemrefCopyToLinalgPass);
- FunctionLikeNest(nestedModulePM).addPass(createConvertLinalgToLoopsPass);
- FunctionLikeNest(nestedModulePM).addPass(createCanonicalizerPass);
- FunctionLikeNest(nestedModulePM).addPass(createCSEPass);
- FunctionLikeNest(nestedModulePM).addPass([]() {
- return createConvertVectorToSCFPass();
- });
- FunctionLikeNest(nestedModulePM).addPass(createCanonicalizerPass);
- FunctionLikeNest(nestedModulePM).addPass(memref::createExpandOpsPass);
+ // Linalg -> SCF.
+ .addPass(IREE::LinalgExt::createLinalgExtToLoopsPass)
+ .addPass(createMemrefCopyToLinalgPass)
+ .addPass(createConvertLinalgToLoopsPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass)
+ .addPass([]() { return createConvertVectorToSCFPass(); })
+ .addPass(createCanonicalizerPass)
+ .addPass(memref::createExpandOpsPass);
// Handle tensor-type constants.
- nestedModulePM.addPass(arith::createConstantBufferizePass());
- nestedModulePM.addPass(createFoldTensorExtractOpPass());
+ modulePassManager.addPass(arith::createConstantBufferizePass());
+ FunctionLikeNest(modulePassManager)
+ .addPass(createFoldTensorExtractOpPass)
- // Resolve get_buffer_descriptor ops. All structural buffer manipulations
- // must conclude before this point.
- FunctionLikeNest(nestedModulePM).addPass(createIREEExpandStridedMetadataPass);
- FunctionLikeNest(nestedModulePM).addPass(createResolveBufferDescriptorsPass);
- FunctionLikeNest(nestedModulePM).addPass(createCleanupBufferAllocViewPass);
+ // Resolve get_buffer_descriptor ops. All structural buffer manipulations
+ // must conclude before this point.
+ .addPass(createIREEExpandStridedMetadataPass)
+ .addPass(createResolveBufferDescriptorsPass)
+ .addPass(createCleanupBufferAllocViewPass)
- // Flatten and cleanup memrefs.
- FunctionLikeNest(nestedModulePM)
- .addPass(memref::createFoldMemRefAliasOpsPass);
- nestedModulePM.addPass(createCanonicalizerPass());
- nestedModulePM.addPass(createCSEPass());
- nestedModulePM.addPass(createFlattenMemRefSubspanPass());
- nestedModulePM.addPass(memref::createNormalizeMemRefsPass());
- nestedModulePM.addNestedPass<func::FuncOp>(
- affine::createAffineScalarReplacementPass());
- nestedModulePM.addPass(createCanonicalizerPass());
+ // Flatten and cleanup memrefs.
+ .addPass(memref::createFoldMemRefAliasOpsPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass);
+
+ modulePassManager.addPass(createFlattenMemRefSubspanPass());
+ modulePassManager.addPass(memref::createNormalizeMemRefsPass());
+
+ FunctionLikeNest(modulePassManager)
+ .addPass(affine::createAffineScalarReplacementPass)
+ .addPass(createCanonicalizerPass)
+ .addPass(createCSEPass);
}
-static void
-buildLoopOptimizationVMVXTransformPassPipeline(OpPassManager &passManager) {
- OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
-
- FunctionLikeNest(nestedModulePM).addPass(createLowerAffinePass);
- FunctionLikeNest(nestedModulePM).addPass(createForOpCanonicalizationPass);
- FunctionLikeNest(nestedModulePM).addPass(createLoopInvariantCodeMotionPass);
+static void buildLoopOptimizationVMVXTransformPassPipeline(
+ FunctionLikeNest &funcPassManager) {
+ funcPassManager.addPass(createLowerAffinePass)
+ .addPass(createForOpCanonicalizationPass)
+ .addPass(createLoopInvariantCodeMotionPass);
}
-void buildVMVXTransformPassPipeline(OpPassManager &passManager) {
+void buildVMVXTransformPassPipeline(OpPassManager &variantPassManager) {
// ---------------------------------------------------------------------------
// Linalg -> Scalars/Vectors
// ---------------------------------------------------------------------------
- buildVectorVMVXTransformPassPipeline(passManager);
-
- passManager.addPass(createCanonicalizerPass());
- passManager.addPass(createCSEPass());
+ buildVectorVMVXTransformPassPipeline(variantPassManager);
// ---------------------------------------------------------------------------
// Standard/Vector/HAL/etc -> VMVX conversion
// ---------------------------------------------------------------------------
- passManager.addNestedPass<mlir::ModuleOp>(createMaterializeConstantsPass());
- passManager.addNestedPass<mlir::ModuleOp>(createConversionPass());
- passManager.addPass(createCanonicalizerPass());
- passManager.addPass(createCSEPass());
+ OpPassManager &modulePassManager = variantPassManager.nest<mlir::ModuleOp>();
+ modulePassManager.addPass(createMaterializeConstantsPass());
+ modulePassManager.addPass(createConversionPass());
+
+ FunctionLikeNest funcPassManager(modulePassManager);
+ funcPassManager.addPass(createCanonicalizerPass).addPass(createCSEPass);
// ---------------------------------------------------------------------------
// Cleanup and canonicalization
// ---------------------------------------------------------------------------
- buildLoopOptimizationVMVXTransformPassPipeline(passManager);
- passManager.addPass(createCanonicalizerPass());
- passManager.addPass(createCSEPass());
+ buildLoopOptimizationVMVXTransformPassPipeline(funcPassManager);
+ funcPassManager.addPass(createCanonicalizerPass).addPass(createCSEPass);
}
namespace {
@@ -156,15 +159,15 @@
static PassPipelineRegistration<> configurationPassPipeline(
"iree-vmvx-configuration-pipeline",
"Runs the full IREE VMVX dialect configuration pipeline",
- [](OpPassManager &passManager) {
- buildVMVXConfigurationPassPipeline(passManager);
+ [](OpPassManager &modulePassManager) {
+ buildVMVXConfigurationPassPipeline(modulePassManager);
});
static PassPipelineRegistration<> transformPassPipeline(
"iree-vmvx-transformation-pipeline",
"Runs the full IREE VMVX dialect transformation pipeline",
- [](OpPassManager &passManager) {
- buildVMVXTransformPassPipeline(passManager);
+ [](OpPassManager &variantPassManager) {
+ buildVMVXTransformPassPipeline(variantPassManager);
});
}
diff --git a/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.h
index ffcb165..1b4ef57 100644
--- a/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.h
+++ b/compiler/src/iree/compiler/Dialect/VMVX/Transforms/Passes.h
@@ -22,7 +22,7 @@
// Adds a set of passes to the given pass manager that configure the required
// VMVX transforms and tiling parameters.
-void buildVMVXConfigurationPassPipeline(OpPassManager &passManager);
+void buildVMVXConfigurationPassPipeline(OpPassManager &variantPassManager);
// Adds a set of passes to the given pass manager that run the required VMVX
// transforms in the canonical order.
@@ -35,7 +35,7 @@
// buildVMVXConfigurationPassPipeline & run
// buildVMVXTransformPassPipeline & run
// <serialize VM module>
-void buildVMVXTransformPassPipeline(OpPassManager &passManager);
+void buildVMVXTransformPassPipeline(OpPassManager &variantPassManager);
//===----------------------------------------------------------------------===//
// Dialect conversion
diff --git a/compiler/src/iree/compiler/Utils/PassUtils.h b/compiler/src/iree/compiler/Utils/PassUtils.h
index e358cd1..ab525d5 100644
--- a/compiler/src/iree/compiler/Utils/PassUtils.h
+++ b/compiler/src/iree/compiler/Utils/PassUtils.h
@@ -4,8 +4,8 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#ifndef IREE_COMPILER_UTILS_FUNCTIONUTILS_H_
-#define IREE_COMPILER_UTILS_FUNCTIONUTILS_H_
+#ifndef IREE_COMPILER_UTILS_PASSUTILS_H_
+#define IREE_COMPILER_UTILS_PASSUTILS_H_
#include <array>
@@ -84,4 +84,4 @@
} // namespace mlir::iree_compiler
-#endif // IREE_COMPILER_UTILS_FUNCTIONUTILS_H_
+#endif // IREE_COMPILER_UTILS_PASSUTILS_H_
diff --git a/samples/transform_dialect/example_module.mlir b/samples/transform_dialect/example_module.mlir
index 9ea0f78..13128e1 100644
--- a/samples/transform_dialect/example_module.mlir
+++ b/samples/transform_dialect/example_module.mlir
@@ -125,7 +125,7 @@
// CODEGEN-PRINTER: IR printer: Setting matmul strategy to custom_transform_strategy
// CODEGEN-PRINTER: translation_info = #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @custom_transform_strategy>
// CODEGEN-PRINTER: IR printer: Setting reduce strategy to base vectorize top-level
-// CODEGEN-PRINTER: translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize>, workgroup_size = [16 : index, 1 : index, 1 : index]
+// CODEGEN-PRINTER: translation_info = #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [16, 1, 1]>
/// Then test with threading to make sure it runs
// RUN: iree-compile %s --iree-hal-target-backends=vulkan \
@@ -135,9 +135,6 @@
// RUN: --mlir-disable-threading | \
// RUN: FileCheck %s --check-prefixes=CODEGEN
-// CODEGEN: Ran custom_transform_strategy
// CODEGEN: spirv.func @example_module_dispatch_0_generic_80_f32
-// CODEGEN: hal.executable private @example_module_dispatch_1
-// CODEGEN: #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @custom_transform_strategy>
-// CODEGEN: spirv.func @example_module_dispatch_1_matmul_16x16x5_f32
+// CODEGEN: spirv.func @example_module_dispatch_1_matmul_16x16x5_f32
// CODEGEN: spirv.func @example_module_dispatch_2_generic_16x16_f32
diff --git a/samples/transform_dialect/transform_library.mlir b/samples/transform_dialect/transform_library.mlir
index 8b17af7..21349fa 100644
--- a/samples/transform_dialect/transform_library.mlir
+++ b/samples/transform_dialect/transform_library.mlir
@@ -3,7 +3,7 @@
// the name of this strategy down below before strategy selection, overriding
// default IREE codegen.
transform.named_sequence @custom_transform_strategy(
- %variant_op: !transform.any_op {transform.consumed}) {
+ %variant_op: !transform.any_op) {
// Step 1. Re-match the matmul
// ===========================================================================
%matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
@@ -40,19 +40,18 @@
transform.apply_patterns to %func_1 {
transform.apply_patterns.linalg.erase_unnecessary_inputs
} : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.iree.bufferize { target_gpu } %func_1 : (!transform.any_op) -> (!transform.any_op)
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_7 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
transform.iree.map_nested_forall_to_gpu_threads %func_7
workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
// Step 7. Do layout analysis and lower to mma
// ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %func_10 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
transform.print {name = "Ran custom_transform_strategy"}
transform.yield
@@ -61,10 +60,10 @@
// Send it down a custom transform dialect pipeline.
transform.named_sequence @custom_matmul(%matmul: !transform.any_op {transform.readonly}) {
%variant_op = transform.get_parent_op %matmul {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %funcs = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
%subgroup_reduce = transform.param.constant #iree_codegen.translation_info<TransformDialectCodegen
codegen_spec = @custom_transform_strategy> -> !transform.any_param
- transform.annotate %exports "translation_info" = %subgroup_reduce : !transform.any_op, !transform.any_param
+ transform.annotate %funcs "translation_info" = %subgroup_reduce : !transform.any_op, !transform.any_param
transform.print {name = "Setting matmul strategy to custom_transform_strategy"}
transform.yield
}
@@ -74,11 +73,9 @@
%variant_op = transform.get_parent_op %reduce {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op
%lowering_config = transform.param.constant #iree_codegen.lowering_config<tile_sizes = [[8, 0], [1, 0], [0, 0, 4]]> -> !transform.any_param
transform.annotate %reduce "lowering_config" = %lowering_config : !transform.any_op, !transform.any_param
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- %subgroup_reduce = transform.param.constant #iree_codegen.translation_info<SPIRVBaseVectorize> -> !transform.any_param
- %workgroup_size = transform.param.constant [16 : index, 1 : index, 1 : index] -> !transform.any_param
- transform.annotate %exports "translation_info" = %subgroup_reduce : !transform.any_op, !transform.any_param
- transform.annotate %exports "workgroup_size" = %workgroup_size : !transform.any_op, !transform.any_param
+ %funcs = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %subgroup_reduce = transform.param.constant #iree_codegen.translation_info<SPIRVBaseVectorize workgroup_size = [16, 1, 1]> -> !transform.any_param
+ transform.annotate %funcs "translation_info" = %subgroup_reduce : !transform.any_op, !transform.any_param
transform.print {name = "Setting reduce strategy to base vectorize"}
transform.yield
}
diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py
index e7e6bed..ba61828 100644
--- a/tests/e2e/matmul/generate_e2e_matmul_tests.py
+++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py
@@ -118,7 +118,7 @@
# Prints the workgroup size
def workgroup_size_str(self):
- return "[" + ", ".join(map(str, self.workgroup_size)) + "]"
+ return "workgroup_size = [" + ", ".join(map(str, self.workgroup_size)) + "]"
# Returns the list of TestShape's to use for the collection of shapes
@@ -537,10 +537,9 @@
f"#compilation{generate_function.compilation_index} = "
"#iree_codegen.compilation_info<\n"
f" lowering_config = <tile_sizes = {compilation_info.tile_sizes}>,\n"
- f" translation_info = <{compiler_pipeline},\n"
+ f" translation_info = <{compiler_pipeline} {compilation_info.workgroup_size_str()},\n"
f" {{ pipeline_depth = {compilation_info.software_pipeline_depth}, "
- f" store_stage = 1{mma_schedule} }}>,\n"
- f" workgroup_size = {compilation_info.workgroup_size_str()}>\n"
+ f" store_stage = 1{mma_schedule} }}>>\n"
)
compilation_info_attr = (
f"{{compilation_info = #compilation{generate_function.compilation_index}}} "
diff --git a/tests/transform_dialect/cpu/attention_codegen_spec.mlir b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
index d768936..5b64367 100644
--- a/tests/transform_dialect/cpu/attention_codegen_spec.mlir
+++ b/tests/transform_dialect/cpu/attention_codegen_spec.mlir
@@ -42,22 +42,21 @@
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
} : !transform.any_op
- transform.iree.apply_licm %variant_op : !transform.any_op
- transform.apply_cse to %variant_op : !transform.any_op
+ transform.iree.apply_licm %func_3 : !transform.any_op
+ transform.apply_cse to %func_3 : !transform.any_op
// Bufferization
// ==========================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
transform.apply_patterns to %func_3 {
transform.apply_patterns.linalg.erase_unnecessary_inputs
} : !transform.any_op
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
+ %func_4 = transform.iree.bufferize %func_3 : (!transform.any_op) -> (!transform.any_op)
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- %func_8 = transform.structured.hoist_redundant_vector_transfers %func_7
+ transform.iree.forall_to_workgroup %func_4 : (!transform.any_op) -> ()
+ %func_8 = transform.structured.hoist_redundant_vector_transfers %func_4
: (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func_8 {
transform.apply_patterns.canonicalization
@@ -66,9 +65,8 @@
transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
// Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
%none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+ transform.annotate %func_8 "translation_info" = %none : !transform.any_op, !transform.any_param
transform.yield
} // codegen
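
This spec, like the ones below, finishes by tagging the bufferized function with `#iree_codegen.translation_info<None>` so that it is treated as already translated. A minimal sketch of the kind of guard a lowering pipeline can use, assuming the usual IREE codegen attribute accessors (`getDispatchLoweringPassPipeline` and the `None` enum value are assumptions, not taken from this patch):

```
// Sketch: skip functions a transform script already lowered and marked
// with translation_info<None>.
static bool alreadyTranslated(
    IREE::Codegen::TranslationInfoAttr translationInfo) {
  return translationInfo &&
         translationInfo.getDispatchLoweringPassPipeline() ==
             IREE::Codegen::DispatchLoweringPassPipeline::None;
}
```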
diff --git a/tests/transform_dialect/cpu/transform_library.mlir b/tests/transform_dialect/cpu/transform_library.mlir
index 6bb1e12..69d946d 100644
--- a/tests/transform_dialect/cpu/transform_library.mlir
+++ b/tests/transform_dialect/cpu/transform_library.mlir
@@ -21,16 +21,15 @@
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %func_op : !transform.any_op
- %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
+ %func_op_3 = transform.iree.bufferize %func_op : (!transform.any_op) -> (!transform.any_op)
+ %memref_func = transform.structured.match ops{["func.func"]} in %func_op_3
: (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
// CSE is needed on the workgroup_count region to pass this particular test.
- transform.apply_cse to %variant_op_3 : !transform.any_op
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ transform.apply_cse to %memref_func : !transform.any_op
%none_attr = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none_attr : !transform.any_op, !transform.any_param
+ transform.annotate %memref_func "translation_info" = %none_attr : !transform.any_op, !transform.any_param
transform.yield
}
}
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
index 02a1d92..c695996 100644
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
@@ -2,7 +2,7 @@
module attributes { transform.with_named_sequence } {
transform.named_sequence @codegen(
- %variant_op: !transform.any_op {transform.consumed}) {
+ %variant_op: !transform.any_op) {
// Step 1. Find the fill and matmul ops
// ===========================================================================
@@ -43,35 +43,30 @@
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
transform.apply_patterns to %func_3 {
transform.apply_patterns.linalg.erase_unnecessary_inputs
} : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
// Step 5. Pre-process the contract and transfer ops to put it in the right form.
// ===========================================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_2 {
+ transform.apply_patterns to %memref_func {
transform.apply_patterns.iree.prepare_vector_to_mma
} : !transform.any_op
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
// Step 7. Do layout analysis and lower to mma
// ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+ %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
// Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
%none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+ transform.annotate %func_11 "translation_info" = %none : !transform.any_op, !transform.any_param
transform.yield
}
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
index d1362d0..0abe3bc 100644
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
@@ -2,7 +2,7 @@
module attributes { transform.with_named_sequence } {
transform.named_sequence @__transform_main(
- %variant_op: !transform.any_op {transform.consumed}) {
+ %variant_op: !transform.any_op) {
// Step 1. Find the fill, matmul and generic ops
// ===========================================================================
%fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
@@ -44,29 +44,25 @@
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
transform.apply_patterns to %func_3 {
transform.apply_patterns.linalg.erase_unnecessary_inputs
} : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func
workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
// Step 7. Do layout analysis and lower to mma
// ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+ %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
// Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
%none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+ transform.annotate %func_11 "translation_info" = %none : !transform.any_op, !transform.any_param
transform.yield
}
} // module
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
index a7c32a4..0549eda 100644
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
@@ -2,7 +2,7 @@
module attributes { transform.with_named_sequence } {
transform.named_sequence @codegen(
- %variant_op: !transform.any_op {transform.consumed}) {
+ %variant_op: !transform.any_op) {
// Step 1. Find the fill, matmul and generic ops
// ===========================================================================
@@ -48,28 +48,24 @@
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
transform.apply_patterns to %func_3 {
transform.apply_patterns.linalg.erase_unnecessary_inputs
} : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
// Step 7. Do layout analysis and lower to mma
// ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+ %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
// Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
%none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+ transform.annotate %func_11 "translation_info" = %none : !transform.any_op, !transform.any_param
transform.yield
}
} // module
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
index 83e2496..0a6546d 100644
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
@@ -45,36 +45,31 @@
transform.apply_patterns.canonicalization
} : !transform.any_op
transform.apply_cse to %func_3 : !transform.any_op
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
+ transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
transform.apply_patterns to %func_3 {
transform.apply_patterns.linalg.erase_unnecessary_inputs
} : !transform.any_op
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+ %memref_func = transform.iree.bufferize { target_gpu } %func_3: (!transform.any_op) -> (!transform.any_op)
// Step 5. Pre-process the contract and transfer ops to put it in the right form.
// ===========================================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_2 {
+ transform.apply_patterns to %memref_func {
transform.apply_patterns.iree.prepare_vector_to_mma
} : !transform.any_op
// Step 6. Post-bufferization vector distribution
// ===========================================================================
- %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_7
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func
workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
// Step 7. Do layout analysis and lower to mma
// ===========================================================================
- %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op)
+ %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
// Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
%none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+ transform.annotate %func_11 "translation_info" = %none : !transform.any_op, !transform.any_param
transform.yield
}
} // module