Remove buffer-based pipeline on CPU. (#7665)
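This removes the buffer-based CPUVectorization lowering pipeline: the pipeline enum case, the LLVMCPUVectorization pass and its pipeline plumbing, and the matmul_vectorization.mlir test. Users of the pipeline are migrated to the tensor-based CPUTensorToVectors path, which takes three levels of tile sizes instead of one. A before/after sketch of the migration, taken from the lowering_config.mlir change below (the (workgroup, L1, vector) reading of the levels follows the TilingLevel enum used by the removed pass):

    // Before: buffer-based pipeline, one tiling level.
    #iree_codegen.compilation.info<
        #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
        #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
        workgroup_size = []>
    // After: tensor-based pipeline, (workgroup, L1, vector) tiling levels; the
    // empty first level leaves workgroup tiling to workload_per_wg.
    #iree_codegen.compilation.info<
        #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
        #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
        workgroup_size = []>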
diff --git a/iree/compiler/Codegen/Dialect/LoweringConfig.td b/iree/compiler/Codegen/Dialect/LoweringConfig.td
index 62124cf..95251a1 100644
--- a/iree/compiler/Codegen/Dialect/LoweringConfig.td
+++ b/iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -12,8 +12,6 @@
 // List of pre-existing pipelines for translating executables.
 def CPU_Default : StrEnumAttrCase<"CPUDefault">;
-def CPU_Vectorization
-    : StrEnumAttrCase<"CPUVectorization">;
 def CPU_TensorToVectors
     : StrEnumAttrCase<"CPUTensorToVectors">;
 def CPU_TileFuseAndVectorize
@@ -42,8 +40,8 @@
     "DispatchLoweringPassPipeline",
     "identifier for pass pipeline use to lower dispatch region",
     [CPU_Default, CPU_TensorToVectors, CPU_TileFuseAndVectorize,
-     CPU_Vectorization, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
-     LLVMGPU_MatmulSimt, SPIRV_SimpleDistribute, SPIRV_Vectorize,
+     LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize, LLVMGPU_MatmulSimt,
+     SPIRV_SimpleDistribute, SPIRV_Vectorize,
      SPIRV_VectorizeToCooperativeOps, None]> {
   let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
 }
diff --git a/iree/compiler/Codegen/LLVMCPU/BUILD b/iree/compiler/Codegen/LLVMCPU/BUILD
index 368243a..97c33b6 100644
--- a/iree/compiler/Codegen/LLVMCPU/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/BUILD
@@ -20,7 +20,6 @@
         "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp",
         "LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp",
         "LLVMCPUUnfuseFMAOps.cpp",
-        "LLVMCPUVectorization.cpp",
         "Passes.cpp",
         "VectorContractToAArch64InlineAsmOp.cpp",
     ],
diff --git a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index 4e7ff7a..d2061de 100644
--- a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -23,7 +23,6 @@
     "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp"
     "LLVMCPUTileFuseAndVectorizeLinalgTensorOps.cpp"
     "LLVMCPUUnfuseFMAOps.cpp"
-    "LLVMCPUVectorization.cpp"
     "Passes.cpp"
     "VectorContractToAArch64InlineAsmOp.cpp"
   DEPS
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index a818279..72078cd 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -183,9 +183,6 @@
     case IREE::Codegen::DispatchLoweringPassPipeline::None:
       addCPUDefaultPassPipeline(nestedModulePM);
       break;
-    case IREE::Codegen::DispatchLoweringPassPipeline::CPUVectorization:
-      addCPUVectorizationPassPipeline(nestedModulePM, lowerToVectors);
-      break;
     case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
       addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
       break;
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
deleted file mode 100644
index d27153e..0000000
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorization.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright 2020 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
-#include "iree/compiler/Codegen/PassDetail.h"
-#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
-#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
-#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/MemRef/Transforms/Passes.h"
-#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
-#include "mlir/Dialect/Vector/VectorTransforms.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Matchers.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#define DEBUG_TYPE "iree-linalg-to-llvm-tile-and-vectorize"
-
-namespace mlir {
-namespace iree_compiler {
-
-namespace {
-// Could just be linalg::TilingPattern with a ContractionOpInterface filter,
-// but that is always templated on an op.
-struct TileWorkgroups : public linalg::LinalgBaseTilingPattern {
-  using Base = linalg::LinalgBaseTilingPattern;
-  TileWorkgroups(MLIRContext *context, linalg::LinalgTilingOptions options,
-                 linalg::LinalgTransformationFilter marker)
-      : LinalgBaseTilingPattern(context, options, marker) {}
-  LogicalResult matchAndRewrite(Operation *op,
-                                PatternRewriter &rewriter) const override {
-    auto contractionOp = dyn_cast<linalg::ContractionOpInterface>(op);
-    if (!contractionOp) return failure();
-
-    linalg::TiledLinalgOp tiledLinalgOp;
-    if (failed(Base::matchAndRewriteBase(op, rewriter, tiledLinalgOp)) ||
-        !tiledLinalgOp.tensorResults.empty()) {
-      return failure();
-    }
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
-}  // namespace
-
-namespace {
-struct LLVMCPUVectorizationPass
-    : public LLVMCPUVectorizationBase<LLVMCPUVectorizationPass> {
-  LLVMCPUVectorizationPass(bool vectorize = true) : lowerToVectors(vectorize) {}
-  LLVMCPUVectorizationPass(const LLVMCPUVectorizationPass &pass) {
-    lowerToVectors = pass.lowerToVectors;
-  }
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect, AffineDialect, scf::SCFDialect,
-                    vector::VectorDialect>();
-  }
-  void runOnOperation() override;
-
- private:
-  /// TODO(ravishankarm): Option to not generate any `vector.` instructions.
-  /// The VMVX backend uses the same lowering as the CPU pass but there is no
-  /// lowering of these `vector.` operations to scalar code. So as a WAR do
-  /// the same tiling scheme but avoid generating vector instructions. When
-  /// VMVX can handle vector instructions, drop this option.
-  bool lowerToVectors;
-
-  Option<bool> enableVectorContractToAarch64Asm{
-      *this, "vector-contract-to-aarch64-asm",
-      llvm::cl::desc("Enable lowering vector.contract ops to AArch64 inline "
-                     "assembly."),
-      llvm::cl::init(false)};
-};
-}  // namespace
-
-void LLVMCPUVectorizationPass::runOnOperation() {
-  auto funcOp = getOperation();
-  MLIRContext *context = &getContext();
-
-  // Workgroup first level of tiling.
-  {
-    // First level of tiling patterns. (workgroups memory)
-    RewritePatternSet l1patterns(context);
-    l1patterns.insert<TileWorkgroups>(
-        context,
-        linalg::LinalgTilingOptions().setTileSizeComputationFunction(
-            [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
-              return getTileSizes(builder, op,
-                                  static_cast<unsigned>(TilingLevel::L1Tiles));
-            }),
-        linalg::LinalgTransformationFilter(
-            ArrayRef<Identifier>{},
-            Identifier::get(getWorkgroupL1TileMarker(), context)));
-
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(l1patterns));
-  }
-
-  // Second level of tiling. (workgroups memory -> vectors)
-  {
-    RewritePatternSet l2patterns(context);
-    l2patterns.insert<TileWorkgroups>(
-        context,
-        linalg::LinalgTilingOptions().setTileSizeComputationFunction(
-            [](OpBuilder &builder, Operation *op) -> SmallVector<Value, 4> {
-              return getTileSizes(
-                  builder, op, static_cast<unsigned>(TilingLevel::VectorTiles));
-            }),
-        linalg::LinalgTransformationFilter(
-            Identifier::get(getWorkgroupL1TileMarker(), context),
-            Identifier::get(getVectorizeMarker(), context)));
-
-    (void)applyPatternsAndFoldGreedily(funcOp, std::move(l2patterns));
-  }
-
-  // Apply canonicalization.
-  {
-    RewritePatternSet canonicalizationPatterns =
-        linalg::getLinalgTilingCanonicalizationPatterns(context);
-    populateAffineMinCanonicalizationPattern(canonicalizationPatterns);
-    if (failed(applyPatternsAndFoldGreedily(
-            funcOp, std::move(canonicalizationPatterns)))) {
-      return signalPassFailure();
-    }
-  }
-
-  if (!lowerToVectors) {
-    return;
-  }
-
-  // Op specific conversion.
-  {
-    RewritePatternSet vectorizeOpsPattenrs(context);
-    populateLinalgToVectorVectorizeMMT4dPatterns(context, vectorizeOpsPattenrs);
-    if (failed(applyPatternsAndFoldGreedily(funcOp,
-                                            std::move(vectorizeOpsPattenrs)))) {
-      return signalPassFailure();
-    }
-  }
-
-  // Apply vectorization patterns.
-  {
-    RewritePatternSet vectorizationPatterns(context);
-    linalg::insertVectorizationPatterns<linalg::ContractionOpInterface,
-                                        linalg::CopyOp, linalg::FillOp>(
-        vectorizationPatterns, linalg::LinalgVectorizationOptions(),
-        linalg::LinalgTransformationFilter(
-            Identifier::get(getVectorizeMarker(), context)));
-    vector::populateVectorTransferPermutationMapLoweringPatterns(
-        vectorizationPatterns);
-    vector::populateVectorReductionToContractPatterns(vectorizationPatterns);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorizationPatterns));
-  }
-
-  {
-    // Fold consumer add ops into the contraction op itself.
-    RewritePatternSet canonicalizationPatterns(context);
-    vector::ContractionOp::getCanonicalizationPatterns(canonicalizationPatterns,
-                                                       context);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(canonicalizationPatterns));
-  }
-
-  if (enableVectorContractToAarch64Asm) {
-    RewritePatternSet vectorToAArch64AsmPatterns(context);
-    populateVectorContractToAArch64InlineAsm(vectorToAArch64AsmPatterns,
-                                             context);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorToAArch64AsmPatterns));
-  }
-
-  // Apply vector specific operation lowering.
-  {
-    vector::VectorTransformsOptions vectorTransformsOptions =
-        vector::VectorTransformsOptions().setVectorTransformsOptions(
-            vector::VectorContractLowering::OuterProduct);
-    RewritePatternSet vectorContractLoweringPatterns(context);
-    vectorContractLoweringPatterns.insert<
-        vector::ContractionOpToOuterProductOpLowering,
-        vector::ContractionOpToMatmulOpLowering, vector::ContractionOpLowering>(
-        vectorTransformsOptions, context);
-    vector::populateVectorTransferPermutationMapLoweringPatterns(
-        vectorContractLoweringPatterns);
-    (void)applyPatternsAndFoldGreedily(
-        funcOp, std::move(vectorContractLoweringPatterns));
-  }
-
-  // Hoist hierarchical tiling indexing and other loop invariant transfer
-  // ops computation.
-
-  // Programmatic controlled lowering of vector.transfer only.
-  {
-    VectorTransferToSCFOptions vectorToSCFOptions =
-        VectorTransferToSCFOptions().enableFullUnroll();
-    RewritePatternSet vectorToLoopsPatterns(context);
-    populateVectorToSCFConversionPatterns(vectorToLoopsPatterns,
-                                          vectorToSCFOptions);
-    // Hoist hierarchical tiling indexing and other loop invariant transfer
-    // ops computation.
-    linalg::hoistRedundantVectorTransfers(funcOp);
-
-    memref::populateFoldSubViewOpPatterns(vectorToLoopsPatterns);
-    (void)applyPatternsAndFoldGreedily(funcOp,
-                                       std::move(vectorToLoopsPatterns));
-  }
-}
-
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUVectorizationPass(
-    bool lowerToVectors) {
-  return std::make_unique<LLVMCPUVectorizationPass>(lowerToVectors);
-}
-
-}  // namespace iree_compiler
-}  // namespace mlir
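Note: for reference, the pass deleted above tiled contraction ops on buffers at two marker-driven levels (L1, then vector), vectorized the marked ops, lowered vector.contract through outer products, and unrolled vector.transfer ops to SCF. A rough sketch of the marker progression on a matmul; the concrete marker strings below are illustrative, the real ones come from getWorkgroupL1TileMarker() and getVectorizeMarker():

    // Input, as produced by workgroup-level tiling:
    linalg.matmul {__internal_linalg_transform__ = "workgroup"}
        ins(%lhs, %rhs : memref<64x128xf32>, memref<128x64xf32>)
        outs(%acc : memref<64x64xf32>)
    // After L1 tiling:     32x32x32 tiles, marker "workgroup_l1_tile"
    // After vector tiling: 4x4x4 tiles, marker "vectorize", which the
    // vectorization patterns then rewrite into vector.contract.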
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 195e3fe..cb0a934 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -27,28 +27,6 @@
   return builder.create<memref::AllocaOp>(loc, allocType, dynamicSizes);
 }
 
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
-                                     bool lowerToVectors) {
-  passManager.addPass(createCanonicalizerPass());
-
-  // TODO(ataei): This causes segmentation fault on Android. Fix it and
-  // re-enable.
-  // passManager.addNestedPass<FuncOp>(createPadLinalgWorkgroupTilesPass());
-
-  // Use stack allocation on CPU side.
-  addLinalgBufferizePasses(passManager, cpuAllocationFunction);
-  passManager.addNestedPass<FuncOp>(createCSEPass());
-  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
-
-  // Tile and vectorize linalg ops on buffers.
-  passManager.addNestedPass<FuncOp>(
-      createLLVMCPUVectorizationPass(lowerToVectors));
-  passManager.addNestedPass<FuncOp>(createCSEPass());
-  passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
-
-  passManager.addNestedPass<FuncOp>(createForOpCanonicalizationPass());
-}
-
 LogicalResult verifyTensorToVectorsPassPipelineConfig(
     Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
     IREE::Codegen::TranslationInfoAttr translationInfo,
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index 466d805..342b3ca 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -24,7 +24,6 @@
             "hal_interface_workgroup_info.mlir",
             "illegal_configuration.mlir",
             "materialize_launch_configuration.mlir",
-            "matmul_vectorization.mlir",
             "synchronize_symbol_visibility.mlir",
             "test_config_mmt4d.mlir",
             "tile_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 985c7e6..41f64cb 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -19,7 +19,6 @@
     "hal_interface_workgroup_info.mlir"
     "illegal_configuration.mlir"
     "materialize_launch_configuration.mlir"
-    "matmul_vectorization.mlir"
     "synchronize_symbol_visibility.mlir"
     "test_config_mmt4d.mlir"
     "tile_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 1e7a7ef..a521311 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -19,16 +19,13 @@
 func @matmul_tensors() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  %pcM = hal.interface.load.constant offset = 0 : index
-  %pcN = hal.interface.load.constant offset = 1 : index
-  %pcK = hal.interface.load.constant offset = 2 : index
-  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%pcM, %pcK}
-  %2 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?x?xf32>{%pcK, %pcN}
-  %4 = hal.interface.binding.subspan @io::@arg2[%c0] : memref<?x?xf32>{%pcM, %pcN}
-  %6 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%pcM, %pcN}
-  %M = memref.dim %0, %c0 : memref<?x?xf32>
-  %N = memref.dim %2, %c1 : memref<?x?xf32>
-  %K = memref.dim %0, %c1 : memref<?x?xf32>
+  %M = hal.interface.load.constant offset = 0 : index
+  %N = hal.interface.load.constant offset = 1 : index
+  %K = hal.interface.load.constant offset = 2 : index
+  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
+  %2 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
+  %4 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
+  %6 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
   %workgroup_size_x = hal.interface.workgroup.size[0] : index
   %workgroup_size_y = hal.interface.workgroup.size[1] : index
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
@@ -42,15 +39,12 @@
     %11 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_size_x, %workgroup_count_x]
     scf.for %arg1 = %10 to %N step %11 {
       %12 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %N]
-      %13 = memref.subview %0[%arg0, 0] [%12, %K] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+      %13 = flow.dispatch.tensor.load %0, offsets=[%arg0, 0], sizes=[%12, %K], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
       %14 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %M]
-      %15 = memref.subview %2[0, %arg1] [%K, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-      %16 = memref.subview %4[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-      %17 = memref.alloc(%12, %14) : memref<?x?xf32>
-      linalg.copy(%16, %17) : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32>
-      linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %15 : memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>) outs(%17 : memref<?x?xf32>)
-      %18 = memref.subview %6[%arg0, %arg1] [%12, %14] [1, 1] : memref<?x?xf32> to memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
-      linalg.copy(%17, %18) : memref<?x?xf32>, memref<?x?xf32, affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>>
+      %15 = flow.dispatch.tensor.load %2, offsets=[0, %arg1], sizes=[%K, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+      %16 = flow.dispatch.tensor.load %4, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+      %17 = linalg.matmul ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
+      flow.dispatch.tensor.store %17, %6, offsets=[%arg0, %arg1], sizes=[%12, %14], strides=[1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
     }
   }
   return
@@ -96,19 +90,23 @@
   %c0 = arith.constant 0 : index
   %dim0 = hal.interface.load.constant offset = 0 : index
   %dim1 = hal.interface.load.constant offset = 1 : index
-  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<?x?xf32>{%dim0, %dim1}
-  %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<?xf32>{%dim1}
-  %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<?x?xf32>{%dim0, %dim1}
-  linalg.generic {
+  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
+  %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?xf32>{%dim1}
+  %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim0, %dim1}
+  %3 = flow.dispatch.tensor.load %0, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:?xf32> -> tensor<?xf32>
+  %5 = linalg.init_tensor [%dim0, %dim1] : tensor<?x?xf32>
+  %6 = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                      affine_map<(d0, d1) -> (d1)>,
                      affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]}
-  ins(%0, %1 : memref<?x?xf32>, memref<?xf32>) outs(%2 : memref<?x?xf32>) {
+  ins(%3, %4 : tensor<?x?xf32>, tensor<?xf32>) outs(%5 : tensor<?x?xf32>) {
   ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):  // no predecessors
-    %3 = arith.addf %arg0, %arg1 : f32
-    linalg.yield %3 : f32
-  }
+    %7 = arith.addf %arg0, %arg1 : f32
+    linalg.yield %7 : f32
+  } -> tensor<?x?xf32>
+  flow.dispatch.tensor.store %6, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
   return
 }
 hal.interface private @io {
@@ -396,8 +394,8 @@
 // -----
 
 #compilation = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
     workgroup_size = []>
 hal.executable private @preset_config_matmul_tensors {
   hal.executable.variant @system_elf_x86_64, target = #hal.executable.target<"llvm", "system-elf-x86_64"> {
@@ -451,10 +449,10 @@
     }
   }
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[32, 32, 32]{{\]}}, native_vector_size = []>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[], [32, 32, 32], [4, 4, 4]{{\]}}, native_vector_size = [4, 4, 4]>
 // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
 // CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 32)>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>
 // CHECK:     hal.executable.entry_point
 // CHECK-SAME:  translation.info = #[[TRANSLATION]]
 // CHECK-NEXT:  ^bb0(%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index
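Note: the substance of this test update is that tiles are now materialized as tensor values through flow.dispatch.tensor.load/store instead of being staged through memref.subview, memref.alloc, and linalg.copy, so bufferization happens inside the lowering pipeline rather than before it. The access pattern, reduced to its minimal shape (SSA names abbreviated for illustration):

    %lhs = flow.dispatch.tensor.load %in0, offsets=[%i, 0], sizes=[%m, %K], strides=[1, 1]
        : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
    %rhs = flow.dispatch.tensor.load %in1, offsets=[0, %j], sizes=[%K, %n], strides=[1, 1]
        : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
    %acc = flow.dispatch.tensor.load %in2, offsets=[%i, %j], sizes=[%m, %n], strides=[1, 1]
        : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
    // The matmul consumes and produces tensors; no local allocation or copies.
    %res = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
        outs(%acc : tensor<?x?xf32>) -> tensor<?x?xf32>
    flow.dispatch.tensor.store %res, %out, offsets=[%i, %j], sizes=[%m, %n], strides=[1, 1]
        : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>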
diff --git a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir b/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
deleted file mode 100644
index 80d6213..0000000
--- a/iree/compiler/Codegen/LLVMCPU/test/matmul_vectorization.mlir
+++ /dev/null
@@ -1,143 +0,0 @@
-// RUN: iree-opt -pass-pipeline="hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{use-lowering-pipeline='builtin.func(iree-llvmcpu-vectorization)'}))" -split-input-file %s | IreeFileCheck %s
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @dynamic_matmul {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
-    hal.executable.entry_point @matmul_128x128x128 attributes {
-      interface = @io,
-      ordinal = 0 : index
-    }
-    builtin.module {
-      func @matmul_128x128x128() {
-        %c0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xf32>
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xf32>
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xf32>
-        %workgroup_id_x = hal.interface.workgroup.id[0] : index
-        %workgroup_count_x = hal.interface.workgroup.count[0] : index
-        %workgroup_id_y = hal.interface.workgroup.id[1] : index
-        %workgroup_count_y = hal.interface.workgroup.count[1] : index
-        %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
-        %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
-        scf.for %arg0 = %3 to %c128 step %4 {
-          %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
-          %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
-          scf.for %arg1 = %5 to %c128 step %6 {
-            %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xf32> to memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xf32> to memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xf32> to memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xf32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
-          }
-        }
-        return
-      }
-    }
-  }
-}
-// CHECK-LABEL: func @matmul_128x128x128
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-// CHECK-DAG: %[[START:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[WORGKROUP_SIZE:.+]] = arith.constant 64
-// CHECK-DAG: %[[VECTOR_SIZE:.+]] = arith.constant 4
-// CHECK-DAG: %[[L1_SIZE:.+]] = arith.constant 32
-// CHECK-DAG: %[[KDIM_SIZE:.+]] = arith.constant 128
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-// CHECK: %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_b_1:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
-
-// -----
-
-#config = #iree_codegen.lowering.config<tile_sizes = [[64, 64], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>
-hal.executable private @matmul_i8_i8_i32 {
-  hal.interface @io {
-    hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-  hal.executable.variant @llvm, target = #hal.executable.target<"llvm", "embedded-elf-x86_64"> {
-    hal.executable.entry_point @matmul_i8_i8_i32_128x128x128 attributes {
-      interface = @io,
-      ordinal = 0 : index
-    }
-    builtin.module {
-      func @matmul_i8_i8_i32_128x128x128() {
-        %c0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan @io::@arg0[%c0] : memref<128x128xi8>
-        %1 = hal.interface.binding.subspan @io::@arg1[%c0] : memref<128x128xi8>
-        %2 = hal.interface.binding.subspan @io::@ret0[%c0] : memref<128x128xi32>
-        %workgroup_id_x = hal.interface.workgroup.id[0] : index
-        %workgroup_count_x = hal.interface.workgroup.count[0] : index
-        %workgroup_id_y = hal.interface.workgroup.id[1] : index
-        %workgroup_count_y = hal.interface.workgroup.count[1] : index
-        %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
-        %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_y]
-        scf.for %arg0 = %3 to %c128 step %4 {
-          %5 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
-          %6 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
-          scf.for %arg1 = %5 to %c128 step %6 {
-            %7 = memref.subview %0[%arg0, 0] [64, 128] [1, 1] : memref<128x128xi8> to memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %8 = memref.subview %1[0, %arg1] [128, 64] [1, 1] : memref<128x128xi8> to memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            %9 = memref.subview %2[%arg0, %arg1] [64, 64] [1, 1] : memref<128x128xi32> to memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>
-            linalg.matmul {lowering.config = #config} ins(%7, %8 : memref<64x128xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>, memref<128x64xi8, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>) outs(%9 : memref<64x64xi32, affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)>>)
-          }
-        }
-        return
-      }
-    }
-  }
-}
-// CHECK-LABEL: func @matmul_i8_i8_i32_128x128x128
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
-// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
-// CHECK-DAG: %[[START:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[WORGKROUP_SIZE:.+]] = arith.constant 64
-// CHECK-DAG: %[[VECTOR_SIZE:.+]] = arith.constant 4
-// CHECK-DAG: %[[L1_SIZE:.+]] = arith.constant 32
-// CHECK-DAG: %[[KDIM_SIZE:.+]] = arith.constant 128
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[WORGKROUP_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[KDIM_SIZE]] step %[[L1_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]] {
-// CHECK: %[[VEC_C_0:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_1:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_2:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: %[[VEC_C_3:.+]] = vector.transfer_read %[[RET0]]
-// CHECK: scf.for {{.*}} = %[[START]] to %[[L1_SIZE]] step %[[VECTOR_SIZE]]
-// CHECK: %[[VEC_A_0:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_1:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_2:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_A_3:.+]] = vector.transfer_read %[[ARG0]]
-// CHECK: %[[VEC_B_0:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_b_1:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_2:.+]] = vector.transfer_read %[[ARG1]]
-// CHECK: %[[VEC_B_3:.+]] = vector.transfer_read %[[ARG1]]
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index 06f1f13..516be98 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -191,11 +191,6 @@
 /// to memrefs
 void addCPUDefaultPassPipeline(OpPassManager &passManager);
 
-/// Populates the passes needed to lower to vector operations using linalg
-/// based progressive lowering with vectorization after bufferization.
-void addCPUVectorizationPassPipeline(OpPassManager &passManager,
-                                     bool lowerToVectors = true);
-
 /// Populates the passes needed to multi level tile and lowering of linalg ops
 /// on tensors to vectors operations.
 LogicalResult verifyTensorToVectorsPassPipelineConfig(
diff --git a/iree/compiler/Codegen/Passes.td b/iree/compiler/Codegen/Passes.td
index ef1570f..0c8f923 100644
--- a/iree/compiler/Codegen/Passes.td
+++ b/iree/compiler/Codegen/Passes.td
@@ -153,12 +153,6 @@
   let constructor = "mlir::iree_compiler::createLLVMCPUUnfuseFMAOpsPass()";
 }
 
-def LLVMCPUVectorization :
-    Pass<"iree-llvmcpu-vectorization", "FuncOp"> {
-  let summary = "Tile and vectorize for CPU backends";
-  let constructor = "mlir::iree_compiler::createLLVMCPUVectorizationPass()";
-}
-
 def VectorToAArch64InlineAsm :
     Pass<"iree-llvmcpu-vector-to-aarch64-inline-asm", "FuncOp"> {
   let summary = "Convert vector operations to aarch64 inline asm LLVMIR dialect";
diff --git a/iree/test/e2e/regression/lowering_config.mlir b/iree/test/e2e/regression/lowering_config.mlir
index 17b401f..ab4857d 100644
--- a/iree/test/e2e/regression/lowering_config.mlir
+++ b/iree/test/e2e/regression/lowering_config.mlir
@@ -1,10 +1,10 @@
 #compilation0 = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[32, 32, 32]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [32, 32]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [32, 32, 32], [4, 4, 4]], native_vector_size = [4, 4, 4]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [32, 32]>,
     workgroup_size = []>
 #compilation1 = #iree_codegen.compilation.info<
-    #iree_codegen.lowering.config<tile_sizes = [[64, 64, 64]], native_vector_size = []>,
-    #iree_codegen.translation.info<"CPUVectorization", workload_per_wg = [64, 64]>,
+    #iree_codegen.lowering.config<tile_sizes = [[], [64, 64, 64], [16, 16, 16]], native_vector_size = [16, 16, 16]>,
+    #iree_codegen.translation.info<"CPUTensorToVectors", workload_per_wg = [64, 64]>,
     workgroup_size = []>
 func @lowering_config_test() {
   %a = util.unfoldable_constant dense<1.0> : tensor<128x256xf32>
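Note: in the migrated configs the three tile_sizes entries map onto the tiling levels the tensor pipeline consumes. The annotated reading below is a sketch; the (workgroup, L1, vector) level names are inferred from the TilingLevel enum referenced by the removed pass, and the (M, N, K) ordering is the usual matmul convention:

    #iree_codegen.lowering.config<
        tile_sizes = [[],               // workgroup level: empty, driven by workload_per_wg = [64, 64]
                      [64, 64, 64],     // L1 tile sizes for (M, N, K)
                      [16, 16, 16]],    // vector tile sizes for (M, N, K)
        native_vector_size = [16, 16, 16]>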