Merge google -> main (#7667)
* bb1e2a751 Synchronize submodules with LLVM at llvm/llvm-project@8909dc5ebe8a
* 206b858d2 Run bazel_to_cmake on iree/compiler/Codegen/Common/.
* e3e22e769 Merge pull request #7664 from GMNGeoffrey:main-to-google
* d4725075d Integrate LLVM at llvm/llvm-project@8909dc5ebe8a
* 1c01a25cf Integrate LLVM at llvm/llvm-project@4602f52d482c
* cae7b77d2 Migrate android.support.annotation.NonNull to androidx.
* a4cd45079 Integrate LLVM at llvm/llvm-project@f46f93b47863
* 9685f4c90 Integrate LLVM at llvm/llvm-project@1d7fdbbc183a
* 5446edced Integrate LLVM at llvm/llvm-project@42102bce98e5
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/BUILD b/integrations/tensorflow/iree_tf_compiler/TFL/BUILD
index 67079cd..bfff2f9 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/BUILD
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/BUILD
@@ -56,6 +56,7 @@
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:QuantOps",
+ "@llvm-project//mlir:ReconcileUnrealizedCasts",
"@llvm-project//mlir:Shape",
"@llvm-project//mlir:ShapeTransforms",
"@llvm-project//mlir:StandardOps",
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp b/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp
index 67930ce..d5f8ddf 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/LowerGlobalTensors.cpp
@@ -10,6 +10,9 @@
#include "iree/compiler/Utils/ConversionUtils.h"
#include "iree_tf_compiler/TFL/PassDetail.h"
#include "iree_tf_compiler/TFL/Passes.h"
+#include "mlir/Dialect/Tosa/IR/TosaOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -24,7 +27,7 @@
: public LowerGlobalTensorsBase<LowerGlobalTensorsPass> {
public:
void getDependentDialects(DialectRegistry& registry) const override {
- registry.insert<mlir::TFL::TensorFlowLiteDialect,
+ registry.insert<mlir::TFL::TensorFlowLiteDialect, tosa::TosaDialect,
iree_compiler::IREE::Util::UtilDialect>();
}
@@ -49,14 +52,29 @@
// Look through the initialization functions and find the assigned values
// for each handle, save out the constant value.
for (auto init : func.getOps<mlir::TFL::CallOnceOp>()) {
- FuncOp initFunc = symNameToFunction[init.session_init_function()];
+ auto findInitFunc =
+ symNameToFunction.find(init.session_init_function());
+ if (findInitFunc == symNameToFunction.end()) {
+ init.emitError("Unable to find initialization function: " +
+ init.session_init_function());
+ continue;
+ }
+ FuncOp initFunc = std::get<1>(*findInitFunc);
for (auto assign : initFunc.getOps<mlir::TFL::AssignVariableOp>()) {
auto handle = dyn_cast<mlir::TFL::VarHandleOp>(
assign.resource_id().getDefiningOp());
if (!handle) continue;
DenseElementsAttr constant;
- if (!matchPattern(assign.value(), m_Constant(&constant))) continue;
+ if (!matchPattern(assign.value(), m_Constant(&constant))) {
+        // For quantized types we cannot use the m_Constant matcher.
+ if (auto constOp = dyn_cast<mlir::TFL::QConstOp>(
+ assign.value().getDefiningOp())) {
+ constant = constOp.value().cast<DenseElementsAttr>();
+ }
+ }
+ if (!constant) continue;
+
auto name = handle.shared_name();
sharedNameToConstant[name] = constant;
sharedNameToLoc[name] = handle.getLoc();
@@ -131,8 +149,19 @@
if (!address) continue;
builder.setInsertionPoint(assign);
+ Value value = assign.value();
+ Type storageType = address.getType()
+ .cast<iree_compiler::IREE::Util::PtrType>()
+ .getTargetType();
+ if (storageType != value.getType()) {
+ value = builder
+ .create<UnrealizedConversionCastOp>(assign.getLoc(),
+ storageType, value)
+ .getResult(0);
+ }
+
builder.create<iree_compiler::IREE::Util::GlobalStoreIndirectOp>(
- assign.getLoc(), assign.value(), assign.resource_id());
+ assign.getLoc(), value, assign.resource_id());
assign.erase();
}
@@ -149,9 +178,15 @@
auto type = ptrType.getTargetType();
builder.setInsertionPoint(read);
- auto load =
+ Value load =
builder.create<iree_compiler::IREE::Util::GlobalLoadIndirectOp>(
read.getLoc(), type, read.resource_id());
+ if (type != read.getResult().getType()) {
+ load = builder
+ .create<UnrealizedConversionCastOp>(
+ read.getLoc(), read.getResult().getType(), load)
+ .getResult(0);
+ }
read.getResult().replaceAllUsesWith(load);
read.erase();
}
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp b/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp
index ef90c13..3d18060 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/Passes.cpp
@@ -7,6 +7,7 @@
#include "iree_tf_compiler/TFL/Passes.h"
#include "iree/compiler/Dialect/Shape/Transforms/Passes.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
#include "mlir/Dialect/Shape/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"
@@ -56,6 +57,7 @@
mlir::tosa::createTFTFLtoTOSALegalizationPipeline(pm, tosaOptions);
pm.nest<FuncOp>().addPass(mlir::tosa::createStripQuantTypesPass());
pm.addPass(createCanonicalizerPass());
+ pm.addPass(createReconcileUnrealizedCastsPass());
//----------------------------------------------------------------------------
// Lowering shape-related constructs
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp b/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp
index 3a6a782..1e46707 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/VerifyFullyConverted.cpp
@@ -28,6 +28,7 @@
ConversionTarget target(getContext());
target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
target.addIllegalDialect<mlir::TFL::TensorFlowLiteDialect>();
+ target.addIllegalOp<mlir::UnrealizedConversionCastOp>();
if (failed(
iree_compiler::verifyAllOperationsAreLegal(getOperation(), target)))
return signalPassFailure();
diff --git a/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir b/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir
index e9d8668..15a4b94 100644
--- a/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir
+++ b/integrations/tensorflow/iree_tf_compiler/TFL/test/lower_global_tensors.mlir
@@ -1,9 +1,8 @@
// RUN: iree-opt-tflite -split-input-file -allow-unregistered-dialect -pass-pipeline='iree-tflite-lower-global-tensors' %s | IreeFileCheck %s
-// CHECK-LABEL: module {
module {
// CHECK: util.global private mutable @__iree_flow_Variable = dense<1.000000e+00> : tensor<16x16xf32>
- // CHECK: func @state
+ // CHECK-LABEL: func @state
func @state(%arg0: tensor<16x16xf32>) -> () {
"tfl.call_once"() {session_init_function = "StateInit"} : () -> ()
return
@@ -19,11 +18,10 @@
// -----
-// CHECK-LABEL: module {
module {
// CHECK: util.global private mutable @__iree_flow_Variable = dense<1.000000e+00> : tensor<16x16xf32>
- // CHECK: func @assign
+ // CHECK-LABEL: func @assign
func @assign(%arg0: tensor<16x16xf32>) -> () {
"tfl.call_once"() {session_init_function = "AssignInit"} : () -> ()
// CHECK: %[[ADDR:.+]] = util.global.address @__iree_flow_Variable : !util.ptr<tensor<16x16xf32>>
@@ -44,11 +42,10 @@
// -----
-// CHECK-LABEL: module {
module {
// CHECK: util.global private mutable @__iree_flow_Variable = dense<1.000000e+00> : tensor<16x16xf32>
- // CHECK: func @read
+ // CHECK-LABEL: func @read
func @read(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) {
"tfl.call_once"() {session_init_function = "ReadInit"} : () -> ()
@@ -70,11 +67,10 @@
// -----
-// CHECK-LABEL: module {
module {
// CHECK: util.global private mutable @__iree_flow_Variable = dense<2.000000e+00> : tensor<16x16xf32>
- // func @readAssign
+ // CHECK-LABEL: func @readAssign
func @readAssign(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) {
"tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> ()
// CHECK: %[[ADDR:.+]] = util.global.address @__iree_flow_Variable : !util.ptr<tensor<16x16xf32>>
@@ -101,6 +97,35 @@
// -----
module {
+ // CHECK: util.global private mutable @__iree_flow_Variable = dense<42> : tensor<2x3xi8>
+ // CHECK-LABEL: func @readAssignQuant
+ func @readAssignQuant(%arg0: tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) -> (tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) {
+ "tfl.call_once"() {session_init_function = "ReadAssignInit"} : () -> ()
+ %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource>
+
+ // CHECK: %[[ADDR:.+]] = util.global.load.indirect %ptr___iree_flow_Variable : !util.ptr<tensor<2x3xi8>> -> tensor<2x3xi8>
+ // CHECK: %[[CAST:.+]] = builtin.unrealized_conversion_cast %[[ADDR]] : tensor<2x3xi8> to tensor<2x3x!quant.uniform<i8:f32, 1.000000e-01:2>>
+ %1 = "tfl.read_variable"(%0) : (tensor<*x!tf_type.resource>) -> tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+
+ // CHECK: %[[ADD:.+]] = tfl.add %[[CAST]], %arg0 {fused_activation_function = "NONE"}
+ %2 = tfl.add %1, %arg0 {fused_activation_function = "NONE"} : tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+
+ // CHECK: %[[CAST2:.+]] = builtin.unrealized_conversion_cast %[[ADD]] : tensor<2x3x!quant.uniform<i8:f32, 1.000000e-01:2>> to tensor<2x3xi8>
+ // CHECK: util.global.store.indirect %[[CAST2]], %ptr___iree_flow_Variable
+ "tfl.assign_variable"(%0, %2) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) -> ()
+ return %2 : tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+ }
+ func private @ReadAssignInit() {
+ %0 = "tfl.var_handle"() {container = "", shared_name = "Variable"} : () -> tensor<*x!tf_type.resource>
+ %1 = "tfl.pseudo_const"() {qtype = tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>, value = dense<42> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>
+ "tfl.assign_variable"(%0, %1) : (tensor<*x!tf_type.resource>, tensor<2x3x!quant.uniform<i8:f32, 0.1:2>>) -> ()
+ return
+ }
+}
+
+// -----
+
+module {
// CHECK-label: @nostate
func @nostate(%arg0: tensor<16x16xf32>) -> (tensor<16x16xf32>) {
"tfl.call_once"() {session_init_function = "NoStateInit"} : () -> ()
diff --git a/iree/compiler/Codegen/BUILD b/iree/compiler/Codegen/BUILD
index 8a91e56..175001a 100644
--- a/iree/compiler/Codegen/BUILD
+++ b/iree/compiler/Codegen/BUILD
@@ -34,6 +34,7 @@
],
deps = [
":PassesIncGen",
+ "//iree/compiler/Codegen/Dialect:IREECodegenDialect",
"//iree/compiler/Dialect/HAL/IR",
"@llvm-project//mlir:LinalgTransforms",
"@llvm-project//mlir:Pass",
@@ -50,6 +51,7 @@
":PassHeaders",
":PassesIncGen",
"//iree/compiler/Codegen/Common",
+ "//iree/compiler/Codegen/Dialect:IREECodegenDialect",
"//iree/compiler/Codegen/LLVMCPU",
"//iree/compiler/Codegen/LLVMGPU",
"//iree/compiler/Codegen/SPIRV",
diff --git a/iree/compiler/Codegen/CMakeLists.txt b/iree/compiler/Codegen/CMakeLists.txt
index dc25c35..0fcd93d 100644
--- a/iree/compiler/Codegen/CMakeLists.txt
+++ b/iree/compiler/Codegen/CMakeLists.txt
@@ -31,6 +31,7 @@
MLIRLinalgTransforms
MLIRPass
MLIRTransforms
+ iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Dialect::HAL::IR
PUBLIC
)
@@ -44,6 +45,7 @@
::PassHeaders
::PassesIncGen
iree::compiler::Codegen::Common
+ iree::compiler::Codegen::Dialect::IREECodegenDialect
iree::compiler::Codegen::LLVMCPU
iree::compiler::Codegen::LLVMGPU
iree::compiler::Codegen::SPIRV
diff --git a/iree/compiler/Codegen/Dialect/LoweringConfig.td b/iree/compiler/Codegen/Dialect/LoweringConfig.td
index 62124cf..95251a1 100644
--- a/iree/compiler/Codegen/Dialect/LoweringConfig.td
+++ b/iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -12,8 +12,6 @@
// List of pre-existing pipelines for translating executables.
def CPU_Default
: StrEnumAttrCase<"CPUDefault">;
-def CPU_Vectorization
- : StrEnumAttrCase<"CPUVectorization">;
def CPU_TensorToVectors
: StrEnumAttrCase<"CPUTensorToVectors">;
def CPU_TileFuseAndVectorize
@@ -42,8 +40,8 @@
"DispatchLoweringPassPipeline",
"identifier for pass pipeline use to lower dispatch region",
[CPU_Default, CPU_TensorToVectors, CPU_TileFuseAndVectorize,
- CPU_Vectorization, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
- LLVMGPU_MatmulSimt, SPIRV_SimpleDistribute, SPIRV_Vectorize,
+ LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize, LLVMGPU_MatmulSimt,
+ SPIRV_SimpleDistribute, SPIRV_Vectorize,
SPIRV_VectorizeToCooperativeOps, None]> {
let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
}
diff --git a/iree/compiler/Codegen/LLVMCPU/BUILD b/iree/compiler/Codegen/LLVMCPU/BUILD
index 368243a..97c33b6 100644
--- a/iree/compiler/Codegen/LLVMCPU/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/BUILD
@@ -20,7 +20,6 @@
"LLVMCPUTileAndVectorizeLinalgTensorOps.cpp",
"LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp",
"LLVMCPUUnfuseFMAOps.cpp",
- "LLVMCPUVectorization.cpp",
"Passes.cpp",
"VectorContractToAArch64InlineAsmOp.cpp",
],
diff --git a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index 4e7ff7a..d2061de 100644
--- a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -23,7 +23,6 @@
"LLVMCPUTileAndVectorizeLinalgTensorOps.cpp"
"LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp"
"LLVMCPUUnfuseFMAOps.cpp"
- "LLVMCPUVectorization.cpp"
"Passes.cpp"
"VectorContractToAArch64InlineAsmOp.cpp"
DEPS
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index f6bef55..9f794ac 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -304,6 +304,9 @@
lb.getInt(), ub.getInt(),
workloadPerWorkgroup[tiledLoops.size() - 1 - i], vectorSizeVals[i]);
}
+ if (isBatchMatmul) {
+ workloadPerWorkgroup.push_back(1);
+ }
setTranslationInfo(
entryPointFn,
clUseTileFuseAndVectorize
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 0c62f13..72078cd 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -92,6 +92,21 @@
return input;
}
+/// Verify that valid configuration is set for all ops within the compiled
+/// module.
+template <typename F>
+static LogicalResult verifyLoweringConfiguration(
+ ModuleOp module, IREE::Codegen::TranslationInfoAttr translationInfo,
+ F verificationFn) {
+ auto walkResult = module.walk([&](Operation *op) -> WalkResult {
+ IREE::Codegen::LoweringConfigAttr loweringConfig = getLoweringConfig(op);
+ if (!loweringConfig) return WalkResult::advance();
+ return verificationFn(op, loweringConfig, translationInfo,
+ ArrayRef<int64_t>{});
+ });
+ return failure(walkResult.wasInterrupted());
+}
+
void LLVMCPULowerExecutableTargetPass::runOnOperation() {
IREE::HAL::ExecutableVariantOp variantOp = getOperation();
ModuleOp moduleOp = variantOp.getInnerModule();
@@ -118,55 +133,67 @@
}
// There might be multiple entry points in the module. Currently, all of
- // them need to have the same pipeline.
+ // them need to have the same translation info.
// TODO(ravishankarm): This is strange that this is not enforced
- // structurally, but something to address later on. For now this restriction
+ // structurally, but something to address later on. The main issue is how
+ // to invoke separate dynamic pass pipelines on entry point functions, when
+ // the passes might have module level changes. For now this restriction
// is fine.
llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPoints =
getAllEntryPoints(moduleOp);
- Optional<IREE::Codegen::DispatchLoweringPassPipeline> passPipeline;
+ Optional<IREE::Codegen::TranslationInfoAttr> translationInfo;
for (auto &it : entryPoints) {
auto entryPointOp = it.second;
- if (IREE::Codegen::TranslationInfoAttr translationInfo =
+ if (IREE::Codegen::TranslationInfoAttr currTranslationInfo =
getTranslationInfo(entryPointOp)) {
- IREE::Codegen::DispatchLoweringPassPipeline currPipeline =
- translationInfo.getDispatchLoweringPassPipeline();
- if (passPipeline) {
- if (currPipeline != passPipeline.getValue()) {
- moduleOp.emitError(
- "unhandled compilation of entry point function with different "
- "pass pipelines within a module");
- return signalPassFailure();
+ if (translationInfo) {
+ if (currTranslationInfo != translationInfo.getValue()) {
+ moduleOp.emitOpError(
+ "unhandled compilation of entry point functions with different "
+ "translation info");
}
- continue;
+ } else {
+ translationInfo = currTranslationInfo;
}
- passPipeline = currPipeline;
}
}
- executableLoweringPipeline.addPass(createSetNumWorkgroupsPass());
- executableLoweringPipeline.addPass(createCanonicalizerPass());
- if (!testLoweringConfiguration && passPipeline.hasValue()) {
- OpPassManager &nestedModulePM =
- executableLoweringPipeline.nest<ModuleOp>();
- switch (passPipeline.getValue()) {
- case IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault:
- case IREE::Codegen::DispatchLoweringPassPipeline::None:
- addCPUDefaultPassPipeline(nestedModulePM);
- break;
- case IREE::Codegen::DispatchLoweringPassPipeline::CPUVectorization:
- addCPUVectorizationPassPipeline(nestedModulePM, lowerToVectors);
- break;
+ // Verify the configuration.
+ if (translationInfo.hasValue()) {
+ LogicalResult verificationStatus = success();
+ switch (translationInfo.getValue().getDispatchLoweringPassPipeline()) {
case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
- addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
+ verificationStatus = verifyLoweringConfiguration(
+ moduleOp, translationInfo.getValue(),
+ verifyTensorToVectorsPassPipelineConfig);
break;
- case IREE::Codegen::DispatchLoweringPassPipeline::
- CPUTileFuseAndVectorize:
- addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors,
- /*useTileAndVectorizeV2=*/true);
- break;
- default:
- llvm_unreachable("Unsupported pipeline on CPU target.");
+ default:;
+ }
+ if (failed(verificationStatus)) {
+ return signalPassFailure();
+ }
+
+ executableLoweringPipeline.addPass(createSetNumWorkgroupsPass());
+ executableLoweringPipeline.addPass(createCanonicalizerPass());
+ if (!testLoweringConfiguration) {
+ OpPassManager &nestedModulePM =
+ executableLoweringPipeline.nest<ModuleOp>();
+ switch (translationInfo.getValue().getDispatchLoweringPassPipeline()) {
+ case IREE::Codegen::DispatchLoweringPassPipeline::CPUDefault:
+ case IREE::Codegen::DispatchLoweringPassPipeline::None:
+ addCPUDefaultPassPipeline(nestedModulePM);
+ break;
+ case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
+ addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
+ break;
+ case IREE::Codegen::DispatchLoweringPassPipeline::
+ CPUTileFuseAndVectorize:
+ addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors,
+ /*useTileAndVectorizeV2=*/true);
+ break;
+ default:
+ llvm_unreachable("Unsupported pipeline on CPU target.");
+ }
}
}
}
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
deleted file mode 100644
index d27153e..0000000
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright 2020 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
-#include "iree/compiler/Codegen/PassDetail.h"
-#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
-#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
-#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#define DEBUG_TYPE "iree-linalg-to-llvm-tile-and-vectorize"
-
-namespace mlir {
-namespace iree_compiler {
-
-namespace {
-// Could just be linalg::TilingPattern with a ContractionOpInterface filter, but
-// that is always templated on an op.
-struct TileWorkgroups : public linalg::LinalgBaseTilingPattern {
- using Base = linalg::LinalgBaseTilingPattern;
- TileWorkgroups(MLIRContext *context, linalg::LinalgTilingOptions options,
- linalg::LinalgTransformationFilter marker)
- : LinalgBaseTilingPattern(context, options, marker) {}
- LogicalResult matchAndRewrite(Operation *op,
- PatternRewriter &rewriter) const override {
- auto contractionOp = dyn_cast<linalg::ContractionOpInterface>(op);
- if (!contractionOp) return failure();
-
- linalg::TiledLinalgOp tiledLinalgOp;
- if (failed(Base::matchAndRewriteBase(op, rewriter, tiledLinalgOp)) ||
- !tiledLinalgOp.tensorResults.empty()) {
- return failure();
- }
- rewriter.eraseOp(op);
- return success();
- }
-};
-
-} // namespace
-
-namespace {
-struct LLVMCPUVectorizationPass
- : public LLVMCPUVectorizationBase<LLVMCPUVectorizationPass> {
- LLVMCPUVectorizationPass(bool vectorize = true) : lowerToVectors(vectorize) {}
- LLVMCPUVectorizationPass(const LLVMCPUVectorizationPass &pass) {
- lowerToVectors = pass.lowerToVectors;
- }
- void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<linalg::LinalgDialect, AffineDialect, scf::SCFDialect,
- vector::VectorDialect>();
- }
- void runOnOperation() override;
-
- private:
- /// TODO(ravishankarm): Option to not generate any `vector.` instructions. The
- /// VMVX backend uses the same lowering as the CPU pass but there is no
- /// lowering of these `vector.` operations to scalar code. So as a WAR do the
- /// same tiling scheme but avoid generating vector instructions. When VMVX can
- /// handle vector instructions, drop this options.
- bool lowerToVectors;
-
- Option<bool> enableVectorContractToAarch64Asm{
- *this, "vector-contract-to-aarch64-asm",
- llvm::cl::desc("Enable promoting wokgroup memory to full tiles allocated "
- "on the stack."),
- llvm::cl::init(false)};
-};
-} // namespace
-
-void LLVMCPUVectorizationPass::runOnOperation() {
- auto funcOp = getOperation();
- MLIRContext *context = &getContext();
-
- // Workgroup first level of tiling.
- {
- // First level of tiling patterns. (workgroups memory)
- RewritePatternSet l1patterns(context);
- l1patterns.insert<TileWorkgroups>(
- context,
- linalg::LinalgTilingOptions().setTileSizeComputationFunction(
- [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
- return getTileSizes(builder, op,
- static_cast<unsigned>(TilingLevel::L1Tiles));
- }),
- linalg::LinalgTransformationFilter(
- ArrayRef<Identifier>{},
- Identifier::get(getWorkgroupL1TileMarker(), context)));
-
- (void)applyPatternsAndFoldGreedily(funcOp, std::move(l1patterns));
- }
-
- // Second level of tiling. (workgroups memory -> vectors)
- {
- RewritePatternSet l2patterns(context);
- l2patterns.insert<TileWorkgroups>(
- context,
- linalg::LinalgTilingOptions().setTileSizeComputationFunction(
- [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
- return getTileSizes(
- builder, op, static_cast<unsigned>(TilingLevel::VectorTiles));
- }),
- linalg::LinalgTransformationFilter(
- Identifier::get(getWorkgroupL1TileMarker(), context),
- Identifier::get(getVectorizeMarker(), context)));
-
- (void)applyPatternsAndFoldGreedily(funcOp, std::move(l2patterns));
- }
-
- // Apply canonicalization.
- {
- RewritePatternSet canonicalizationPatterns =
- linalg::getLinalgTilingCanonicalizationPatterns(context);
- populateAffineMinCanonicalizationPattern(canonicalizationPatterns);
- if (failed(applyPatternsAndFoldGreedily(
- funcOp, std::move(canonicalizationPatterns)))) {
- return signalPassFailure();
- }
- }
-
- if (!lowerToVectors) {
- return;
- }
-
- // Op specific conversion.
- {
- RewritePatternSet vectorizeOpsPattenrs(context);
- populateLinalgToVectorVectorizeMMT4dPatterns(context, vectorizeOpsPattenrs);
- if (failed(applyPatternsAndFoldGreedily(funcOp,
- std::move(vectorizeOpsPattenrs)))) {
- return signalPassFailure();
- }
- }
-
- // Apply vectorization patterns.
- {
- RewritePatternSet vectorizationPatterns(context);
- linalg::insertVectorizationPatterns<linalg::ContractionOpInterface,
- linalg::CopyOp, linalg::FillOp>(
- vectorizationPatterns, linalg::LinalgVectorizationOptions(),
- linalg::LinalgTransformationFilter(
- Identifier::get(getVectorizeMarker(), context)));
- vector::populateVectorTransferPermutationMapLoweringPatterns(
- vectorizationPatterns);
- vector::populateVectorReductionToContractPatterns(vectorizationPatterns);
- (void)applyPatternsAndFoldGreedily(funcOp,
- std::move(vectorizationPatterns));
- }
-
- {
- // Fold consumer add ops into the contraction op itself.
- RewritePatternSet canonicalizationPatterns(context);
- vector::ContractionOp::getCanonicalizationPatterns(canonicalizationPatterns,
- context);
- (void)applyPatternsAndFoldGreedily(funcOp,
- std::move(canonicalizationPatterns));
- }
-
- if (enableVectorContractToAarch64Asm) {
- RewritePatternSet vectorToAArch64AsmPatterns(context);
- populateVectorContractToAArch64InlineAsm(vectorToAArch64AsmPatterns,
- context);
- (void)applyPatternsAndFoldGreedily(funcOp,
- std::move(vectorToAArch64AsmPatterns));
- }
-
- // Apply vector specific operation lowering.
- {
- vector::VectorTransformsOptions vectorTransformsOptions =
- vector::VectorTransformsOptions().setVectorTransformsOptions(
- vector::VectorContractLowering::OuterProduct);
- RewritePatternSet vectorContractLoweringPatterns(context);
- vectorContractLoweringPatterns.insert<
- vector::ContractionOpToOuterProductOpLowering,
- vector::ContractionOpToMatmulOpLowering, vector::ContractionOpLowering>(
- vectorTransformsOptions, context);
- vector::populateVectorTransferPermutationMapLoweringPatterns(
- vectorContractLoweringPatterns);
- (void)applyPatternsAndFoldGreedily(
- funcOp, std::move(vectorContractLoweringPatterns));
- }
-
- // Hosit hierarchical tiling indexing and other loop invariant transfer
- // ops computation.
-
- // Programmatic controlled lowering of vector.transfer only.
- {
- VectorTransferToSCFOptions vectorToSCFOptions =
- VectorTransferToSCFOptions().enableFullUnroll();
- RewritePatternSet vectorToLoopsPatterns(context);
- populateVectorToSCFConversionPatterns(vectorToLoopsPatterns,
- vectorToSCFOptions);
- // Hosit hierarchical tiling indexing and other loop invariant transfer
- // ops computation.
- linalg::hoistRedundantVectorTransfers(funcOp);
-
- memref::populateFoldSubViewOpPatterns(vectorToLoopsPatterns);
- (void)applyPatternsAndFoldGreedily(funcOp,
- std::move(vectorToLoopsPatterns));
- }
-}
-
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUVectorizationPass(
- bool lowerToVectors) {
- return std::make_unique<LLVMCPUVectorizationPass>(lowerToVectors);
-}
-
-} // namespace iree_compiler
-} // namespace mlir
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 33f3999..cb0a934 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -7,6 +7,7 @@
#include "iree/compiler/Codegen/Passes.h"
#include "iree-dialects/Dialect/LinalgExt/Transforms/Passes.h"
+#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Dialect/Shape/Transforms/Passes.h"
#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
@@ -26,26 +27,87 @@
return builder.create<memref::AllocaOp>(loc, allocType, dynamicSizes);
}
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
- bool lowerToVectors) {
- passManager.addPass(createCanonicalizerPass());
+LogicalResult verifyTensorToVectorsPassPipelineConfig(
+ Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+ IREE::Codegen::TranslationInfoAttr translationInfo,
+ ArrayRef<int64_t> workgroupSize) {
+ if (!workgroupSize.empty()) {
+ return op->emitOpError(
+ "expected workgroup size to be empty for CPU pipelines");
+ }
- // TODO(ataei): This causes segmentation fault on Android. Fix it and
- // re-enable.
- // passManager.addNestedPass<FuncOp>(createPadLinalgWorkgroupTilesPass());
+ // Verify that the translation info is using the right pipeline.
+ auto pipeline =
+ IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors;
+ StringRef pipelineName = stringifyEnum(pipeline);
+ if (translationInfo.getDispatchLoweringPassPipeline() != pipeline) {
+ return op->emitOpError("expected pipeline in translation.info to be ")
+ << pipelineName;
+ }
- // Use stack allocation on CPU side.
- addLinalgBufferizePasses(passManager, cpuAllocationFunction);
- passManager.addNestedPass<FuncOp>(createCSEPass());
- passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+ // Verify that the workload per workgroup is set and is non-zero.
+ SmallVector<int64_t> workloadPerWorkgroup =
+ translationInfo.getWorkloadPerWorkgroupVals();
+ SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
+ if (workloadPerWorkgroup.size() != partitionedLoops.size()) {
+ return op->emitOpError("expected ")
+ << partitionedLoops.size()
+ << " entries for workload_per_wg, but got "
+ << workloadPerWorkgroup.size();
+ }
+ if (llvm::any_of(workloadPerWorkgroup,
+ [](int64_t val) { return val == 0; })) {
+ return op->emitOpError("invalid to use 0 in workload_per_wg");
+ }
- // Tile and vectorize linalg ops on buffers.
- passManager.addNestedPass<FuncOp>(
- createLLVMCPUVectorizationPass(lowerToVectors));
- passManager.addNestedPass<FuncOp>(createCSEPass());
- passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+ if (loweringConfig.getTileSizes().size() != 3) {
+ return op->emitOpError("expected three levels of tile sizes for ")
+ << pipelineName << ", got " << loweringConfig.getTileSizes().size();
+ }
+ SmallVector<int64_t> firstLevelTileSizes = loweringConfig.getTileSizeVals(0);
+ if (!firstLevelTileSizes.empty()) {
+ // Verify that if the first-level tile sizes are set, they are the same as
+ // workload_per_wg for the partitioned loops.
+ size_t minElements =
+ (partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1);
+ if (firstLevelTileSizes.size() < minElements) {
+ return op->emitOpError("expected at least ")
+ << minElements
+ << " size for first level tiling to get the distribution fully "
+ "specified.";
+ }
+ llvm::SmallDenseSet<unsigned> partitionedLoopsSet;
+ partitionedLoopsSet.insert(partitionedLoops.begin(),
+ partitionedLoops.end());
+ SmallVector<int64_t> partitionedTileSizes;
+ for (auto tileSize : llvm::enumerate(firstLevelTileSizes)) {
+ if (!partitionedLoopsSet.count(tileSize.index())) {
+ continue;
+ }
+ partitionedTileSizes.push_back(tileSize.value());
+ }
+ for (auto val : llvm::enumerate(llvm::reverse(workloadPerWorkgroup))) {
+ if (val.value() != partitionedTileSizes[val.index()]) {
+ return op->emitOpError("mismatch in distributed tile size value ")
+ << partitionedTileSizes[val.index()] << " at position "
+ << val.index() << " and workload_per_wg value " << val.value();
+ }
+ }
+ }
- passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
+  // Verify that the native vector size is either empty or, if set, the same
+  // as the last level of tile sizes.
+ SmallVector<int64_t> nativeVectorSize =
+ loweringConfig.getNativeVectorSizeVals();
+ if (!nativeVectorSize.empty()) {
+ if (nativeVectorSize !=
+ loweringConfig.getTileSizeVals(
+ static_cast<unsigned>(TilingLevel::VectorTiles))) {
+ return op->emitOpError(
+ "native_vector_size must be same as the last level of tiling");
+ }
+ }
+ return success();
}
void addTensorToVectorsPassPipeline(OpPassManager &passManager,
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index 0c8c8c6..342b3ca 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -22,8 +22,8 @@
"hal_interface_bindings.mlir",
"hal_interface_constants.mlir",
"hal_interface_workgroup_info.mlir",
+ "illegal_configuration.mlir",
"materialize_launch_configuration.mlir",
- "matmul_vectorization.mlir",
"synchronize_symbol_visibility.mlir",
"test_config_mmt4d.mlir",
"tile_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index ac58c8a..41f64cb 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -17,8 +17,8 @@
"hal_interface_bindings.mlir"
"hal_interface_constants.mlir"
"hal_interface_workgroup_info.mlir"
+ "illegal_configuration.mlir"
"materialize_launch_configuration.mlir"
- "matmul_vectorization.mlir"
"synchronize_symbol_visibility.mlir"
"test_config_mmt4d.mlir"
"tile_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
new file mode 100644
index 0000000..57b4cc6
--- /dev/null
+++ b/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
@@ -0,0 +1,154 @@
+// RUN: iree-opt -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{test-lowering-configuration=true}))' -verify-diagnostics -split-input-file %s
+
+#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = []>
+hal.executable private @matmul_tensors {
+ hal.interface @io {
+ hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+ hal.executable.entry_point @illegal attributes {
+ translation.info = #translation,
+ interface = @io,
+ ordinal = 0 : index
+ }
+ builtin.module {
+ func @illegal() {
+ %c0 = arith.constant 0 : index
+ %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+ %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+ %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+ // expected-error @+1 {{expected 2 entries for workload_per_wg, but got 0}}
+ linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+ outs(%result: memref<4x16xf32>)
+ return
+ }
+ }
+ }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [1, 0]>
+hal.executable private @matmul_tensors {
+ hal.interface @io {
+ hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+ hal.executable.entry_point @illegal attributes {
+ translation.info = #translation,
+ interface = @io,
+ ordinal = 0 : index
+ }
+ builtin.module {
+ func @illegal() {
+ %c0 = arith.constant 0 : index
+ %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+ %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+ %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+ // expected-error @+1 {{invalid to use 0 in workload_per_wg}}
+ linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+ outs(%result: memref<4x16xf32>)
+ return
+ }
+ }
+ }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [1, 1]>
+hal.executable private @matmul_tensors {
+ hal.interface @io {
+ hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+ hal.executable.entry_point @illegal attributes {
+ translation.info = #translation,
+ interface = @io,
+ ordinal = 0 : index
+ }
+ builtin.module {
+ func @illegal() {
+ %c0 = arith.constant 0 : index
+ %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+ %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+ %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+ // expected-error @+1 {{expected three levels of tile sizes for CPUTensorToVectors, got 0}}
+ linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+ outs(%result: memref<4x16xf32>)
+ return
+ }
+ }
+ }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [[4, 8], [], []], native_vector_size = []>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [8, 6]>
+hal.executable private @matmul_tensors {
+ hal.interface @io {
+ hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+ hal.executable.entry_point @illegal attributes {
+ translation.info = #translation,
+ interface = @io,
+ ordinal = 0 : index
+ }
+ builtin.module {
+ func @illegal() {
+ %c0 = arith.constant 0 : index
+ %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+ %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+ %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+ // expected-error @+1 {{mismatch in distributed tile size value 4 at position 0 and workload_per_wg value 6}}
+ linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+ outs(%result: memref<4x16xf32>)
+ return
+ }
+ }
+ }
+}
+
+// -----
+
+#config = #iree_codegen.lowering.config<tile_sizes = [[], [], [8, 8, 8]], native_vector_size = [4, 4, 4]>
+#translation = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [8, 4]>
+hal.executable private @matmul_tensors {
+ hal.interface @io {
+ hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+ }
+ hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64", {}> {
+ hal.executable.entry_point @illegal attributes {
+ translation.info = #translation,
+ interface = @io,
+ ordinal = 0 : index
+ }
+ builtin.module {
+ func @illegal() {
+ %c0 = arith.constant 0 : index
+ %lhs = hal.interface.binding.subspan @io::@arg0[%c0] : memref<4x8xf32>
+ %rhs = hal.interface.binding.subspan @io::@arg1[%c0] : memref<8x16xf32>
+ %result = hal.interface.binding.subspan @io::@ret0[%c0] : memref<4x16xf32>
+ // expected-error @+1 {{native_vector_size must be same as the last level of tiling}}
+ linalg.matmul {lowering.config = #config} ins(%lhs, %rhs : memref<4x8xf32>, memref<8x16xf32>)
+ outs(%result: memref<4x16xf32>)
+ return
+ }
+ }
+ }
+}
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 11bc34b..a521311 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -19,16 +19,13 @@
func @matmul_tensors() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
- %pcM = hal.interface.load.constant offset = 0 : index
- %pcN = hal.interface.load.constant offset = 1 : index
- %pcK = hal.interface.load.constant offset = 2 : index
- %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%pcM, %pcK}
- %2 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?x?xf32>{%pcK, %pcN}
- %4 = hal.interface.binding.subspan @io::@arg2[%c0] : memref<?x?xf32>{%pcM, %pcN}
- %6 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%pcM, %pcN}
- %M = memref.dim %0, %c0 : memref<?x?xf32>
- %N = memref.dim %2, %c1 : memref<?x?xf32>
- %K = memref.dim %0, %c1 : memref<?x?xf32>
+ %M = hal.interface.load.constant offset = 0 : index
+ %N = hal.interface.load.constant offset = 1 : index
+ %K = hal.interface.load.constant offset = 2 : index
+ %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
+ %2 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
+ %4 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
+ %6 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
@@ -42,15 +39,12 @@
%11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_size_x, %workgroup_count_x]
scf.for %arg1 = %10 to %N step %11 {
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %N]
- %13 = memref.subview %0[%arg0, 0] [%12, %K] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+ %13 = flow.dispatch.tensor.load %0, offsets=[%arg0, 0], sizes=[%12, %K], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%14 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %M]
- %15 = memref.subview %2[0, %arg1] [%K, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
- %16 = memref.subview %4[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
- %17 = memref.alloc(%12, %14) : memref<?x?xf32>
- linalg.copy(%16, %17) : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32>
- linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %15 : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>) outs(%17 : memref<?x?xf32>)
- %18 = memref.subview %6[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
- linalg.copy(%17, %18) : memref<?x?xf32>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+ %15 = flow.dispatch.tensor.load %2, offsets=[0, %arg1], sizes=[%K, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+ %16 = flow.dispatch.tensor.load %4, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+ %17 = linalg.matmul ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %17, %6, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
@@ -96,19 +90,23 @@
%c0 = arith.constant 0 : index
%dim0 = hal.interface.load.constant offset = 0 : index
%dim1 = hal.interface.load.constant offset = 1 : index
- %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%dim0, %dim1}
- %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?xf32>{%dim1}
- %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%dim0, %dim1}
- linalg.generic {
+ %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
+ %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?xf32>{%dim1}
+ %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim0, %dim1}
+ %3 = flow.dispatch.tensor.load %0, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?xf32> -> tensor<?xf32>
+ %5 = linalg.init_tensor [%dim0, %dim1] : tensor<?x?xf32>
+ %6 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d1)>,
affine_map<(d0, d1) -> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
- ins(%0, %1 : memref<?x?xf32>, memref<?xf32>) outs(%2 : memref<?x?xf32>) {
+ ins(%3, %4 : tensor<?x?xf32>, tensor<?xf32>) outs(%5 : tensor<?x?xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32): // no predecessors
- %3 = arith.addf %arg0, %arg1 : f32
- linalg.yield %3 : f32
- }
+ %7 = arith.addf %arg0, %arg1 : f32
+ linalg.yield %7 : f32
+ } -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
return
}
hal.interface private @io {
@@ -382,23 +380,22 @@
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [1, 32, 32, 32], [1, 4, 4, 4]{{\]}}, native_vector_size = [1, 4, 4, 4]>
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64, 1]>
// CHECK: hal.executable.entry_point public @batch_matmul_tensors
// CHECK-NEXT: (%[[ARG0:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index)
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[D0:.+]] = affine.apply #[[MAP0]]()[%[[ARG0]]]
// CHECK-DAG: %[[D1:.+]] = affine.apply #[[MAP0]]()[%[[ARG1]]]
-// CHECK: hal.return %[[D0]], %[[D1]], %[[C1]]
+// CHECK: hal.return %[[D0]], %[[D1]], %[[ARG2]]
// CHECK: linalg.batch_matmul
// CHECK-SAME: lowering.config = #[[CONFIG]]
// -----
#compilation = #iree_codegen.compilation.info<
- #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
- #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+ #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+ #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
workgroup_size = []>
hal.executable private @preset_config_matmul_tensors {
hal.executable.variant @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64"> {
@@ -452,10 +449,10 @@
}
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[32, 32, 32]{{\]}}, native_vector_size = []>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [32, 32, 32], [4, 4, 4]{{\]}}, native_vector_size = [4, 4, 4]>
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 32)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>
// CHECK: hal.executable.entry_point
// CHECK-SAME: translation.info = #[[TRANSLATION]]
// CHECK-NEXT: ^bb0(%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index
diff --git a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir b/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
deleted file mode 100644
index 80d6213..0000000
--- a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
+++ /dev/null
@@ -1,143 +0,0 @@
-// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{use-lowering-pipeline='builtin.func(iree-llvmcpu-vectorization)'}))" -split-input-file %s | IreeFileCheck %s
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @dynamic_matmul {
- hal.interface @io {
- hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
- hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
- hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
- }
- hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
- hal.executable.entry_point @matmul_128x128x128 attributes {
- interface = @io,
- ordinal = 0 : index
- }
- builtin.module {
- func @matmul_128x128x128() {
- %c0 = arith.constant 0 : index
- %c128 = arith.constant 128 : index
- %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xf32>
- %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xf32>
- %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
- scf.for %arg0 = %3 to %c128 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
- scf.for %arg1 = %5 to %c128 step %6 {
- %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
- %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xf32> to memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
- %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
- linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
- }
- }
- return
- }
- }
- }
-}
-// CHECK-LABEL: func @matmul_128x128x128
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-// CHECK-DAG: %[[START:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[WORGKROUP_SIZE:.+]] = arith.constant 64
-// CHECK-DAG: %[[VECTOR_SIZE:.+]] = arith.constant 4
-// CHECK-DAG: %[[L1_SIZE:.+]] = arith.constant 32
-// CHECK-DAG: %[[KDIM_SIZE:.+]] = arith.constant 128
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-// CHECK: %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_b_1:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
-
-// -----
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @matmul_i8_i8_i32 {
- hal.interface @io {
- hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
- hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
- hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
- }
- hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
- hal.executable.entry_point @matmul_i8_i8_i32_128x128x128 attributes {
- interface = @io,
- ordinal = 0 : index
- }
- builtin.module {
- func @matmul_i8_i8_i32_128x128x128() {
- %c0 = arith.constant 0 : index
- %c128 = arith.constant 128 : index
- %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xi8>
- %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xi8>
- %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xi32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
- %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
- scf.for %arg0 = %3 to %c128 step %4 {
- %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
- %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
- scf.for %arg1 = %5 to %c128 step %6 {
- %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xi8> to memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
- %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xi8> to memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
- %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32> to memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
- linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
- }
- }
- return
- }
- }
- }
-}
-// CHECK-LABEL: func @matmul_i8_i8_i32_128x128x128
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-// CHECK-DAG: %[[START:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[WORGKROUP_SIZE:.+]] = arith.constant 64
-// CHECK-DAG: %[[VECTOR_SIZE:.+]] = arith.constant 4
-// CHECK-DAG: %[[L1_SIZE:.+]] = arith.constant 32
-// CHECK-DAG: %[[KDIM_SIZE:.+]] = arith.constant 128
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-// CHECK: %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_b_1:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
diff --git a/iree/compiler/Codegen/Passes.cpp b/iree/compiler/Codegen/Passes.cpp
index 19acfec..ae9e935 100644
--- a/iree/compiler/Codegen/Passes.cpp
+++ b/iree/compiler/Codegen/Passes.cpp
@@ -6,6 +6,8 @@
#include "iree/compiler/Codegen/Passes.h"
+#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
+
namespace mlir {
namespace iree_compiler {
@@ -48,5 +50,21 @@
});
}
+/// Hook to verify the lowering configuration and translation info for an
+/// operation.
+LogicalResult verifyLoweringConfiguration(
+ Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+ IREE::Codegen::TranslationInfoAttr translationInfo,
+ ArrayRef<int64_t> workgroupSize) {
+ switch (translationInfo.getDispatchLoweringPassPipeline()) {
+ case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
+ return verifyTensorToVectorsPassPipelineConfig(op, loweringConfig,
+ translationInfo);
+ default:
+ break;
+ }
+ return success();
+}
+
} // namespace iree_compiler
} // namespace mlir
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index dd02ee4..516be98 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -9,6 +9,7 @@
#include <memory>
+#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "mlir/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -19,9 +20,15 @@
namespace mlir {
namespace iree_compiler {
-// Registers all conversion passes in this directory.
+/// Registers all conversion passes in this directory.
void registerCodegenPasses();
+/// Verify that the configuration used for compilation is valid.
+LogicalResult verifyLoweringConfiguration(
+ Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+ IREE::Codegen::TranslationInfoAttr translationInfo,
+ ArrayRef<int64_t> workgroupSize = {});
+
//------------------------------------------------------------------------------
// Misc/common conversions
//------------------------------------------------------------------------------
@@ -184,13 +191,12 @@
/// to memrefs
void addCPUDefaultPassPipeline(OpPassManager &passManager);
-/// Populates the passes needed to lower to vector operations using linalg based
-/// progressive lowering with vectorization after bufferization.
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
- bool lowerToVectors = true);
-
/// Populates the passes needed to multi level tile and lowering of linalg ops
/// on tensors to vectors operations.
+LogicalResult verifyTensorToVectorsPassPipelineConfig(
+ Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
+ IREE::Codegen::TranslationInfoAttr translationInfo,
+ ArrayRef<int64_t> workgroupSize = {});
void addTensorToVectorsPassPipeline(OpPassManager &passManager,
bool lowerToVectors = true,
bool useTileAndVectorizeV2 = false);
diff --git a/iree/compiler/Codegen/Passes.td b/iree/compiler/Codegen/Passes.td
index ef1570f..0c8f923 100644
--- a/iree/compiler/Codegen/Passes.td
+++ b/iree/compiler/Codegen/Passes.td
@@ -153,12 +153,6 @@
let constructor = "mlir::iree_compiler::createLLVMCPUUnfuseFMAOpsPass()";
}
-def LLVMCPUVectorization :
- Pass<"iree-llvmcpu-vectorization", "FuncOp"> {
- let summary = "Tile and vectorize for CPU backends";
- let constructor = "mlir::iree_compiler::createLLVMCPUVectorizationPass()";
-}
-
def VectorToAArch64InlineAsm :
Pass<"iree-llvmcpu-vector-to-aarch64-inline-asm", "FuncOp"> {
let summary = "Convert vector operations to aarch64 inline asm LLVMIR dialect";
diff --git a/iree/test/e2e/regression/lowering_config.mlir b/iree/test/e2e/regression/lowering_config.mlir
index 17b401f..ab4857d 100644
--- a/iree/test/e2e/regression/lowering_config.mlir
+++ b/iree/test/e2e/regression/lowering_config.mlir
@@ -1,10 +1,10 @@
#compilation0 = #iree_codegen.compilation.info<
- #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
- #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+ #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+ #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
workgroup_size = []>
#compilation1 = #iree_codegen.compilation.info<
- #iree_codegen.lowering.config<tile_sizes = [[64, 64, 64]], native_vector_size = []>,
- #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [64, 64]>,
+ #iree_codegen.lowering.config<tile_sizes = [[], [64, 64, 64], [16, 16, 16]], native_vector_size = [16, 16, 16]>,
+ #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64]>,
workgroup_size = []>
func @lowering_config_test() {
%a = util.unfoldable_constant dense<1.0> : tensor<128x256xf32>