| // Copyright 2021 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h" |
| |
| #include <cstdint> |
| #include <numeric> |
| #include <optional> |
| |
| #include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h" |
| #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" |
| #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" |
| #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h" |
| #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h" |
| #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h" |
| #include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h" |
| #include "iree/compiler/Codegen/LLVMGPU/Passes.h" |
| #include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" |
| #include "iree/compiler/Codegen/Utils/GPUUtils.h" |
| #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h" |
| #include "iree/compiler/Codegen/Utils/Utils.h" |
| #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" |
| #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" |
| #include "iree/compiler/Dialect/LinalgExt/Utils/IndexingUtils.h" |
| #include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "mlir/Analysis/SliceAnalysis.h" |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/IR/BuiltinAttributes.h" |
| #include "mlir/IR/BuiltinTypeInterfaces.h" |
| #include "mlir/IR/Matchers.h" |
| #include "mlir/IR/OperationSupport.h" |
| #include "mlir/IR/TypeUtilities.h" |
| #include "mlir/IR/Types.h" |
| #include "mlir/IR/Value.h" |
| |
| #define DEBUG_TYPE "iree-llvmgpu-kernel-config" |
| #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
| #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
| namespace mlir::iree_compiler { |
| |
| llvm::cl::opt<bool> clGPUTestTileAndFuseMatmul( |
| "iree-codegen-llvmgpu-test-tile-and-fuse-matmul", |
| llvm::cl::desc("test the the tile and fuse pipeline for matmul"), |
| llvm::cl::init(false)); |
| |
| llvm::cl::opt<bool> clGPUTestTileAndFuseVectorize( |
| "iree-codegen-llvmgpu-test-tile-and-fuse-vectorize", |
| llvm::cl::desc( |
| "test the tile and fuse pipeline for all supported operations"), |
| llvm::cl::init(false)); |
| |
| llvm::cl::opt<bool> clGPUEnableVectorDistribution( |
| "iree-codegen-llvmgpu-use-vector-distribution", |
| llvm::cl::desc("enable the usage of the vector distribution pipeline"), |
| llvm::cl::init(true)); |
| |
| llvm::cl::opt<bool> clGPUEnableTransformDialectJit( |
| "iree-codegen-llvmgpu-enable-transform-dialect-jit", |
| llvm::cl::desc("enable the usage of the transform dialect JIT"), |
| llvm::cl::init(false)); |
| |
| /// Flag to force using WMMA tensorcore operations. |
| llvm::cl::opt<bool> |
| clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma", |
| llvm::cl::desc("force use of wmma operations for tensorcore"), |
| llvm::cl::init(false)); |
| |
| /// Flag used to toggle using mma.sync vs wmma when targeting tensorcore. |
| llvm::cl::opt<bool> |
| clGPUUseMMASync("iree-codegen-llvmgpu-use-mma-sync", |
| llvm::cl::desc("force use mma sync instead of wmma ops"), |
| llvm::cl::init(false)); |
| |
| llvm::cl::opt<int> clGPUMatmulCThreshold( |
| "iree-codegen-llvmgpu-matmul-c-matrix-threshold", |
| llvm::cl::desc("matmul c matrix element count threshold to be considered " |
| "as small vs. large when deciding MMA schedule"), |
| // TODO: We should get this value from the target's parallelism. |
| llvm::cl::init(512 * 512)); |
| |
| static llvm::cl::opt<bool> clLLVMGPUEnablePrefetch( |
| "iree-llvmgpu-enable-prefetch", |
| llvm::cl::desc("Enable prefetch in the vector distribute pipeline"), |
| llvm::cl::init(false)); |
| |
| namespace { |
| |
| using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline; |
| |
| // Threshold used to determine whether a matmul dimension is 'very skinny'. |
| constexpr int64_t kVerySkinnyDimThreshold = 4; |
| |
| struct TileWorkgroupSizePair { |
| // How many scalar elements each workgroup should handle along each dimension. |
| std::array<int64_t, 3> tileSize; |
| std::array<int64_t, 3> workgroupSize; |
| int64_t pipelineDepth; |
| }; |
| |
| // Simt codegen does not do software pipelining. |
| constexpr unsigned softwarePipelineDepthSimt = 0; |
| |
| } // namespace |
| |
| static bool isROCmBackend(IREE::GPU::TargetAttr target) { |
| return target.getArch().starts_with("gfx"); |
| } |
| |
| static bool needsLoweringConfigPropagation( |
| IREE::Codegen::DispatchLoweringPassPipeline pipeline) { |
| using Pipeline = IREE::Codegen::DispatchLoweringPassPipeline; |
| // Pipelines that do not need propagation of lowering config. |
| Pipeline supportedPipelines[] = {Pipeline::LLVMGPUTileAndFuse, |
| Pipeline::LLVMGPUVectorDistribute, |
| Pipeline::LLVMGPUPadAndVectorDistribute}; |
| return !llvm::is_contained(supportedPipelines, pipeline); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Matmul Configuration Helpers |
| //====---------------------------------------------------------------------===// |
| |
| /// Return the set of candidate tile-size and workgroup-size combinations. |
| /// The caller then picks the best candidate aligned with the shape dimensions. |
| static SmallVector<TileWorkgroupSizePair> |
| getMatmulConfig(IREE::GPU::TargetAttr target) { |
| SmallVector<TileWorkgroupSizePair> tileSizes; |
| // Pick tile sizes so that the M*K and K*N workgroup copies are divisible by |
| // wgSize * vecSize (vecSize = 4). This way the workgroup memory copies don't |
| // need to be masked. Once we support masked loads we can get performance out |
| // of more configurations. |
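| // For example (illustrative check for the {32, 128, 32} tile with a |
| // {32, 8, 1} workgroup below): the workgroup has 32*8 = 256 threads, so |
| // wgSize * vecSize = 1024; the LHS tile copy is 32*32 = 1024 elements and |
| // the RHS tile copy is 32*128 = 4096 elements, both multiples of 1024, so |
| // vector-4 copies need no masking. |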
| |
| // Make use of the full subgroup when possible. |
| if (target.getPreferredSubgroupSize() == 64) { |
| tileSizes.push_back(TileWorkgroupSizePair({{64, 128, 64}, {64, 16, 1}, 1})); |
| } |
| |
| llvm::append_values(tileSizes, |
| TileWorkgroupSizePair({{32, 128, 32}, {32, 8, 1}, 1}), |
| TileWorkgroupSizePair({{128, 64, 8}, {16, 8, 1}, 1}), |
| TileWorkgroupSizePair({{16, 256, 32}, {64, 2, 1}, 1}), |
| TileWorkgroupSizePair({{8, 32, 32}, {8, 8, 1}, 1}), |
| |
| TileWorkgroupSizePair({{32, 128, 4}, {32, 8, 1}, 1}), |
| TileWorkgroupSizePair({{8, 128, 4}, {32, 1, 1}, 1}), |
| TileWorkgroupSizePair({{16, 64, 4}, {16, 2, 1}, 1}), |
| TileWorkgroupSizePair({{1, 128, 8}, {32, 1, 1}, 1})); |
| return tileSizes; |
| } |
| |
| /// Return the best combination of tile size and wg size when using tensorcore |
| /// operations. |
| static void |
| getTensorCoreConfig(SmallVectorImpl<TileWorkgroupSizePair> &tileSizes, |
| Type elementType, int64_t M, int64_t N, int64_t K) { |
| // Based on early analysis we found that 128x256x32_3 gives acceptable |
| // performance across many of the large matrix sizes for f16 and f32. This |
| // needs to be refined into a better strategy based on empirical data, but it |
| // gives us a quick solution to achieve performance in the right order of |
| // magnitude for large square-like cases. |
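| // Illustrative example: a 2048x2048 f16 output has M*N = 4194304, which |
| // exceeds 1536*1536 = 2359296, so the 128x256x32 tile with pipeline depth 3 |
| // is tried first, with the 32x32x32 depth-4 tile as a fallback. |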
| int64_t parallelDim = M * N; |
| static constexpr int64_t kLargDimThreashold = 1536; |
| if (elementType.isF16()) { |
| if (parallelDim >= kLargDimThreashold * kLargDimThreashold) { |
| tileSizes.push_back( |
| TileWorkgroupSizePair({{128, 256, 32}, {128, 2, 1}, 3})); |
| } |
| tileSizes.push_back(TileWorkgroupSizePair({{32, 32, 32}, {64, 2, 1}, 4})); |
| } else { |
| if (parallelDim >= kLargDimThreashold * kLargDimThreashold) { |
| tileSizes.push_back( |
| TileWorkgroupSizePair({{128, 256, 16}, {128, 2, 1}, 4})); |
| } |
| llvm::append_values(tileSizes, |
| TileWorkgroupSizePair({{32, 32, 16}, {64, 2, 1}, 4}), |
| TileWorkgroupSizePair({{16, 32, 16}, {64, 1, 1}, 4}), |
| TileWorkgroupSizePair({{32, 16, 16}, {32, 2, 1}, 4}), |
| TileWorkgroupSizePair({{16, 16, 16}, {32, 1, 1}, 4})); |
| } |
| } |
| |
| static bool supportsTensorCore(IREE::GPU::TargetAttr target, |
| linalg::LinalgOp op) { |
| // Limit tensor core pipeline to matmul as not all combinations of transpose |
| // are supported upstream. |
| if (!target.supportsSyncMMAOps()) |
| return false; |
| if (!(isa<linalg::MatmulOp>(op) || isa<linalg::BatchMatmulOp>(op))) { |
| assert(linalg::isaContractionOpInterface(op)); |
| // If this is not a named matmul op, check some properties to make sure that |
| // we can map it to tensorcore ops: the region should contain only a |
| // multiply-add, the output map should have no permutation, and the last |
| // dimension should be a reduction. |
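| // For example (illustrative), a linalg.generic contraction whose body is |
| // just arith.mulf, arith.addf, and linalg.yield, and whose output map is |
| // (d0, d1, d2) -> (d0, d1) with d2 as the reduction, passes these checks. |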
| Region &body = op->getRegion(0); |
| Region::OpIterator it = body.op_begin(); |
| if (it == body.op_end() || !isa<arith::MulFOp>(*(it++))) |
| return false; |
| if (it == body.op_end() || !isa<arith::AddFOp>(*(it++))) |
| return false; |
| if (it == body.op_end() || !isa<linalg::YieldOp>(*(it++))) |
| return false; |
| AffineMap outputMap = op.getMatchingIndexingMap(op.getDpsInitOperand(0)); |
| if (outputMap.getNumResults() != outputMap.getNumDims() - 1) |
| return false; |
| OpBuilder b(op); |
| for (unsigned i = 0, e = outputMap.getNumResults(); i < e - 1; i++) { |
| if (outputMap.getResult(i) != b.getAffineDimExpr(i)) |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /// Decides which tensorcore operations to use. |
| static CodeGenPipeline getTensorCorePipeline(Type elementType) { |
| // Start from the WMMA tensorcore pipeline as the default. |
| CodeGenPipeline codegenPipeline = CodeGenPipeline::LLVMGPUMatmulTensorCore; |
| |
| // For F16 and F32 use mmasync by default. |
| if (elementType.isF16() || elementType.isF32()) { |
| codegenPipeline = CodeGenPipeline::LLVMGPUMatmulTensorCoreMmaSync; |
| } |
| |
| // Override the decision based on cl flags. |
| assert(!(clGPUUseWMMA && clGPUUseMMASync) && "incompatible options."); |
| if (clGPUUseMMASync) { |
| codegenPipeline = CodeGenPipeline::LLVMGPUMatmulTensorCoreMmaSync; |
| } |
| if (clGPUUseWMMA) { |
| codegenPipeline = CodeGenPipeline::LLVMGPUMatmulTensorCore; |
| } |
| return codegenPipeline; |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Vector Distribution Contraction/Convolution Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static LogicalResult |
| setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| linalg::LinalgOp op) { |
| if (target.getWgp().getMma().empty()) |
| return failure(); |
| |
| const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); |
| |
| SmallVector<int64_t, 4> bounds = op.getStaticLoopRanges(); |
| FailureOr<mlir::linalg::ConvolutionDimensions> convolutionDims = |
| mlir::linalg::inferConvolutionDims(op); |
| if (failed(convolutionDims)) { |
| return failure(); |
| } |
| |
| // This strategy turns non-strided/dilated convolution problems into matmul |
| // problems by tiling certain dimensions to 1: |
| // - Batch dimensions (parallel shared by the image and output) |
| // - Filter dimensions (reduction on the filter, and convolved on the image) |
| // - All output image dimensions except the innermost one |
| // |
| // After this, the remaining non-unit dimensions are: |
| // - One output image dimension corresponding to the M dimension of a matmul. |
| // - The output channel dimension, corresponding to the N dimension. |
| // - The input channel dimension, corresponding to the K dimension. |
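| // For example (illustrative), for a linalg.conv_2d_nhwc_hwcf with loops |
| // (n, oh, ow, oc, fh, fw, c): batch = {n}, outputImage = {oh, ow}, |
| // outputChannel = {oc}, filterLoop = {fh, fw}, inputChannel = {c}. After |
| // tiling n, oh, fh, and fw to 1, ow acts as M, oc as N, and c as K. |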
| |
| // TODO: Relax this condition to just the alignment requirements. |
| if (convolutionDims->outputChannel.size() < 1 || |
| convolutionDims->inputChannel.size() < 1 || |
| convolutionDims->filterLoop.size() < 1 || |
| convolutionDims->outputImage.size() < 1 || |
| convolutionDims->depth.size() != 0) { |
| return failure(); |
| } |
| |
| auto isAllOnesList = [](ArrayRef<int64_t> list) { |
| return llvm::all_of(list, [](int64_t i) { return i == 1; }); |
| }; |
| |
| // TODO: Support non-unit strides/dilations. |
| if (!isAllOnesList(convolutionDims->strides) || |
| !isAllOnesList(convolutionDims->dilations)) { |
| return failure(); |
| } |
| |
| int64_t mDim = convolutionDims->outputImage.back(); |
| int64_t nDim = convolutionDims->outputChannel.back(); |
| // TODO: Support NCHW convolutions. This is just a matmul_transpose_a, but |
| // the distribution patterns currently do not support that variant. |
| if (mDim > nDim) { |
| return failure(); |
| } |
| int64_t kDim = convolutionDims->inputChannel.back(); |
| |
| Value lhs = op.getDpsInputOperand(0)->get(); |
| Value rhs = op.getDpsInputOperand(1)->get(); |
| Value init = op.getDpsInitOperand(0)->get(); |
| |
| Type lhsElemType = getElementTypeOrSelf(lhs); |
| Type rhsElemType = getElementTypeOrSelf(rhs); |
| Type initElemType = getElementTypeOrSelf(init); |
| |
| GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim], |
| lhsElemType, rhsElemType, initElemType}; |
| |
| SmallVector<GPUMatmulShapeType> intrinsics; |
| intrinsics.reserve(target.getWgp().getMma().size()); |
| for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { |
| auto [mSize, nSize, kSize] = mma.getMNKShape(); |
| auto [aType, bType, cType] = mma.getABCElementTypes(); |
| if (mma.getSubgroupSize() != targetSubgroupSize) |
| continue; |
| intrinsics.emplace_back(mSize, nSize, kSize, aType, bType, cType); |
| } |
| if (intrinsics.empty()) |
| return failure(); |
| |
| // Note that the following heuristic seeds are just placeholder values. |
| // We need to clean this up and make it adapt to different targets. |
| // See https://github.com/iree-org/iree/issues/16341 for details. |
| GPUMMAHeuristicSeeds seeds{/*bestSubgroupCountPerWorkgroup=*/4, |
| /*bestMNTileCountPerSubgroup=*/8, |
| /*bestKTileCountPerSubgroup=*/2}; |
| |
| int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes(); |
| |
| // First try to find a schedule with an exactly matching intrinsic. |
| FailureOr<GPUMMASchedule> schedule = deduceMMASchedule( |
| problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize); |
| if (failed(schedule)) { |
| // Then try again by allowing upcasting accumulator. |
| schedule = deduceMMASchedule( |
| problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize, |
| /*transposedLhs=*/false, /*transposedRhs=*/false, |
| /*canUpcastAcc=*/true); |
| } |
| if (failed(schedule)) { |
| return failure(); |
| } |
| |
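| // Illustrative example: with targetSubgroupSize = 64, nWarpCount = 2, and |
| // mWarpCount = 2, the workgroup size below is {128, 2, 1}, i.e. 256 threads. |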
| std::array<int64_t, 3> workgroupSize{ |
| schedule->nWarpCount * targetSubgroupSize, schedule->mWarpCount, 1}; |
| |
| SmallVector<int64_t> workgroupTileSizes(op.getNumLoops(), 0); |
| SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0); |
| // Tile all batch dimensions with unit size. |
| for (int64_t batch : convolutionDims->batch) { |
| workgroupTileSizes[batch] = 1; |
| } |
| // Tile all output image dimensions with unit size except the last one. |
| for (int64_t oi : llvm::drop_end(convolutionDims->outputImage)) { |
| workgroupTileSizes[oi] = 1; |
| } |
| for (int64_t oc : llvm::drop_end(convolutionDims->outputChannel)) { |
| workgroupTileSizes[oc] = 1; |
| } |
| for (int64_t ic : llvm::drop_end(convolutionDims->inputChannel)) { |
| reductionTileSizes[ic] = 1; |
| } |
| // Compute the M/N dimension tile sizes by multiplying the subgroup counts, |
| // tile counts, and intrinsic sizes. |
| workgroupTileSizes[mDim] = |
| schedule->mWarpCount * schedule->mTileCount * schedule->mSize; |
| workgroupTileSizes[nDim] = |
| schedule->nWarpCount * schedule->nTileCount * schedule->nSize; |
| |
| reductionTileSizes[kDim] = schedule->kTileCount * schedule->kSize; |
| |
| // Tile all filter loop dimensions to 1. |
| for (int64_t filterDim : convolutionDims->filterLoop) { |
| reductionTileSizes[filterDim] = 1; |
| } |
| |
| MLIRContext *context = op.getContext(); |
| Builder b(context); |
| SmallVector<NamedAttribute, 2> attrs; |
| attrs.emplace_back(StringAttr::get(context, "workgroup"), |
| b.getI64ArrayAttr(workgroupTileSizes)); |
| attrs.emplace_back(StringAttr::get(context, "reduction"), |
| b.getI64ArrayAttr(reductionTileSizes)); |
| |
| auto configDict = DictionaryAttr::get(context, attrs); |
| auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); |
| |
| // Attach the MMA schedule as an attribute to the entry point export function |
| // for later access in the pipeline. |
| SmallVector<NamedAttribute, 1> pipelineAttrs; |
| auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( |
| context, target.getWgp().getMma()[schedule->index], schedule->mWarpCount, |
| schedule->nWarpCount); |
| pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), |
| scheduleAttr); |
| |
| // Prefetch shared memory if requested. |
| if (clLLVMGPUEnablePrefetch) { |
| auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get( |
| context, /*prefetchSharedMemory=*/true, |
| /*no_reduce_shared_memory_bank_conflicts=*/false, |
| /*reorder_workgroups_strategy=*/std::nullopt); |
| pipelineAttrs.emplace_back( |
| StringAttr::get(context, |
| IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()), |
| pipelineOptions); |
| } |
| |
| auto pipelineConfig = DictionaryAttr::get(context, pipelineAttrs); |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, loweringConfig, CodeGenPipeline::LLVMGPUVectorDistribute, |
| workgroupSize, targetSubgroupSize, pipelineConfig); |
| } |
| |
| [[maybe_unused]] static void |
| debugPrintContractionInfo(StringRef label, unsigned numLoops, |
| linalg::ContractionDimensions contractionDims, |
| ArrayRef<int64_t> sizes) { |
| ArrayRef<unsigned> dimVals[] = {contractionDims.batch, contractionDims.m, |
| contractionDims.n, contractionDims.k}; |
| std::string dimSymbols(numLoops, '*'); |
| for (auto [idx, val] : llvm::enumerate(dimSymbols)) { |
| for (auto [letter, dim] : llvm::zip_equal(StringRef("bmnk"), dimVals)) |
| if (llvm::is_contained(dim, idx)) |
| val = letter; |
| } |
| DBGS() << "Contraction dims: ["; |
| llvm::interleaveComma(dimSymbols, llvm::dbgs()); |
| llvm::dbgs() << "]\n"; |
| |
| DBGS() << label << ": ["; |
| llvm::interleaveComma(sizes, llvm::dbgs()); |
| llvm::dbgs() << "]\n"; |
| } |
| |
| static LogicalResult |
| setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| linalg::LinalgOp op) { |
| if (target.getWgp().getMma().empty()) |
| return failure(); |
| |
| const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); |
| |
| SmallVector<int64_t, 4> bounds = op.getStaticLoopRanges(); |
| FailureOr<mlir::linalg::ContractionDimensions> contractionDims = |
| mlir::linalg::inferContractionDims(op); |
| assert(succeeded(contractionDims) && "Could not infer contraction dims"); |
| |
| if (contractionDims->k.size() < 1 || contractionDims->m.size() < 1 || |
| contractionDims->n.size() < 1) { |
| return failure(); |
| } |
| |
| LLVM_DEBUG(debugPrintContractionInfo("Problem size", op.getNumLoops(), |
| *contractionDims, bounds)); |
| |
| // For now we do not try to reshape dimensions to allow for better usage of |
| // intrinsics; instead we tile all dimensions except the innermost m, n, and |
| // k dimensions to 1. |
| int64_t mDim = contractionDims->m.back(); |
| int64_t nDim = contractionDims->n.back(); |
| int64_t kDim = contractionDims->k.back(); |
| |
| // Dynamic dims are expected to be taken care of earlier in the pipeline. |
| if (ShapedType::isDynamic(bounds[mDim]) || |
| ShapedType::isDynamic(bounds[nDim]) || |
| ShapedType::isDynamic(bounds[kDim])) { |
| return failure(); |
| } |
| |
| // Bail out on matvec-like cases. |
| if (bounds[mDim] == 1 || bounds[nDim] == 1) { |
| return failure(); |
| } |
| |
| Value lhs = op.getDpsInputOperand(0)->get(); |
| Value rhs = op.getDpsInputOperand(1)->get(); |
| Value init = op.getDpsInitOperand(0)->get(); |
| |
| Type lhsElemType = getElementTypeOrSelf(lhs); |
| Type rhsElemType = getElementTypeOrSelf(rhs); |
| Type initElemType = getElementTypeOrSelf(init); |
| |
| if (auto lhsOp = lhs.getDefiningOp<linalg::GenericOp>()) { |
| if (IREE::LinalgExt::isBitExtendOp(lhsOp)) |
| lhsElemType = getElementTypeOrSelf(lhsOp.getDpsInputs()[0]); |
| } |
| if (auto rhsOp = rhs.getDefiningOp<linalg::GenericOp>()) { |
| if (IREE::LinalgExt::isBitExtendOp(rhsOp)) |
| rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]); |
| } |
| |
| GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim], |
| lhsElemType, rhsElemType, initElemType}; |
| |
| SmallVector<GPUMatmulShapeType> intrinsics; |
| intrinsics.reserve(target.getWgp().getMma().size()); |
| for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { |
| auto [mSize, nSize, kSize] = mma.getMNKShape(); |
| auto [aType, bType, cType] = mma.getABCElementTypes(); |
| if (mma.getSubgroupSize() != targetSubgroupSize) |
| continue; |
| intrinsics.emplace_back(mSize, nSize, kSize, aType, bType, cType); |
| } |
| if (intrinsics.empty()) |
| return failure(); |
| |
| GPUMMAHeuristicSeeds seeds; |
| |
| // Note that the following heuristic seeds are just placeholder values. |
| // We need to clean this up and make it adapt to different targets. |
| // See https://github.com/iree-org/iree/issues/16341 for details. |
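| // Illustrative example with the default threshold of 512*512 = 262144: a |
| // 256x512xK matmul (M*N = 131072) takes the small-problem seeds below, |
| // while a 1024x1024xK matmul (M*N = 1048576) takes the large-problem seeds. |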
| if (problem.mSize * problem.nSize <= clGPUMatmulCThreshold) { |
| // For matmuls with small M*N size, we want to distribute M*N onto more |
| // workgroups to fill the GPU. Use a smaller bestMNTileCountPerSubgroup |
| // and a larger bestKTileCountPerSubgroup. |
| seeds = {/*bestSubgroupCountPerWorkgroup=*/4, |
| /*bestMNTileCountPerSubgroup=*/4, |
| /*bestKTileCountPerSubgroup=*/8}; |
| } else { |
| seeds = {/*bestSubgroupCountPerWorkgroup=*/4, |
| /*bestMNTileCountPerSubgroup=*/8, |
| /*bestKTileCountPerSubgroup=*/4}; |
| } |
| |
| int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes(); |
| |
| LDBG("Matmul Vector Distribution Config"); |
| |
| auto pipeline = CodeGenPipeline::LLVMGPUVectorDistribute; |
| |
| // Infer if lhs or rhs is transposed to help generate a better schedule. |
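| // For example (illustrative), a plain matmul has LHS map (m, k); its last |
| // result is the k dimension, so transposedLhs is false. A matmul_transpose_a |
| // has LHS map (k, m); its last result is m != k, so transposedLhs is true. |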
| SmallVector<AffineMap> maps = op.getIndexingMapsArray(); |
| bool transposedLhs = |
| kDim != |
| llvm::cast<AffineDimExpr>(maps[0].getResults().back()).getPosition(); |
| bool transposedRhs = |
| nDim != |
| llvm::cast<AffineDimExpr>(maps[1].getResults().back()).getPosition(); |
| |
| // First try to find a schedule with an exactly matching intrinsic. |
| std::optional<GPUMMASchedule> schedule = deduceMMASchedule( |
| problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize); |
| if (!schedule) { |
| // Then try again by allowing upcasting accumulator. |
| schedule = |
| deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes, |
| targetSubgroupSize, transposedLhs, transposedRhs, |
| /*canUpcastAcc=*/true); |
| } |
| |
| // Only batch_matmul is supported in the LLVMGPUPadAndVectorDistribute |
| // pipeline. |
| // TODO(hanchung): Support cases where there are fused producers. |
| if (!schedule && !contractionDims->batch.empty() && !hasFusedLeadingOp(op)) { |
| LDBG("Matmul Pad and Vector Distribute"); |
| pipeline = CodeGenPipeline::LLVMGPUPadAndVectorDistribute; |
| bool mustBeAligned = false; |
| schedule = |
| deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes, |
| targetSubgroupSize, transposedLhs, transposedRhs, |
| /*canUpcastAcc=*/false, mustBeAligned); |
| if (!schedule) { |
| // Then try again by allowing upcasting accumulator. |
| schedule = |
| deduceMMASchedule(problem, intrinsics, seeds, maxSharedMemoryBytes, |
| targetSubgroupSize, transposedLhs, transposedRhs, |
| /*canUpcastAcc=*/true, mustBeAligned); |
| } |
| } |
| if (!schedule) { |
| LDBG("Failed to deduce MMA schedule"); |
| return failure(); |
| } |
| |
| LDBG("Target Subgroup size: " << targetSubgroupSize); |
| LDBG("Schedule: sizes [" << schedule->mSize << ", " << schedule->nSize << ", " |
| << schedule->kSize << "]"); |
| LDBG("Schedule: tile counts [" << schedule->mTileCount << ", " |
| << schedule->nTileCount << ", " |
| << schedule->kTileCount << "]"); |
| LDBG("Schedule: warp counts [" << schedule->mWarpCount << ", " |
| << schedule->nWarpCount << "]"); |
| |
| std::array<int64_t, 3> workgroupSize{ |
| schedule->nWarpCount * targetSubgroupSize, schedule->mWarpCount, 1}; |
| |
| SmallVector<int64_t> workgroupTileSizes(op.getNumLoops(), 0); |
| SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0); |
| // Tile all batch dimensions with unit size. |
| for (int64_t batch : contractionDims->batch) { |
| workgroupTileSizes[batch] = 1; |
| } |
| |
| // Tile all m, n, and k dimensions to 1 except the innermost. Unit dims |
| // from this tiling are folded before vectorization. |
| for (int64_t m : llvm::drop_end(contractionDims->m)) { |
| workgroupTileSizes[m] = 1; |
| } |
| for (int64_t n : llvm::drop_end(contractionDims->n)) { |
| workgroupTileSizes[n] = 1; |
| } |
| for (int64_t k : llvm::drop_end(contractionDims->k)) { |
| reductionTileSizes[k] = 1; |
| } |
| |
| // Compute the M/N dimension tile sizes by multiplying the subgroup counts, |
| // tile counts, and intrinsic sizes. |
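| // Illustrative example: for a 16x16x16 intrinsic with mWarpCount = 2, |
| // nWarpCount = 2, mTileCount = nTileCount = 4, and kTileCount = 2, the |
| // workgroup tiles are M = 2*4*16 = 128 and N = 2*4*16 = 128, and the |
| // reduction tile is K = 2*16 = 32. |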
| workgroupTileSizes[mDim] = |
| schedule->mWarpCount * schedule->mTileCount * schedule->mSize; |
| workgroupTileSizes[nDim] = |
| schedule->nWarpCount * schedule->nTileCount * schedule->nSize; |
| |
| reductionTileSizes[kDim] = schedule->kTileCount * schedule->kSize; |
| |
| LLVM_DEBUG(debugPrintContractionInfo("Workgroup tile sizes", op.getNumLoops(), |
| *contractionDims, workgroupTileSizes)); |
| LLVM_DEBUG(debugPrintContractionInfo("Reduction tile sizes", op.getNumLoops(), |
| *contractionDims, reductionTileSizes)); |
| |
| MLIRContext *context = op.getContext(); |
| Builder b(context); |
| SmallVector<NamedAttribute, 2> attrs; |
| attrs.emplace_back(StringAttr::get(context, "workgroup"), |
| b.getI64ArrayAttr(workgroupTileSizes)); |
| attrs.emplace_back(StringAttr::get(context, "reduction"), |
| b.getI64ArrayAttr(reductionTileSizes)); |
| |
| auto configDict = DictionaryAttr::get(context, attrs); |
| auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); |
| |
| // Attach the MMA schedule as an attribute to the entry point export function |
| // for later access in the pipeline. |
| SmallVector<NamedAttribute, 1> pipelineAttrs; |
| auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( |
| context, target.getWgp().getMma()[schedule->index], schedule->mWarpCount, |
| schedule->nWarpCount); |
| pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), |
| scheduleAttr); |
| |
| // Prefetch shared memory if requested. |
| if (clLLVMGPUEnablePrefetch) { |
| auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get( |
| context, /*prefetchSharedMemory=*/true, |
| /*no_reduce_shared_memory_bank_conflicts=*/false, |
| /*reorder_workgroups_strategy=*/std::nullopt); |
| pipelineAttrs.emplace_back( |
| StringAttr::get(context, |
| IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()), |
| pipelineOptions); |
| } |
| |
| auto pipelineConfig = DictionaryAttr::get(context, pipelineAttrs); |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, loweringConfig, pipeline, workgroupSize, |
| targetSubgroupSize, pipelineConfig); |
| } |
| |
| static LogicalResult |
| setAttentionVectorDistributionConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| IREE::LinalgExt::AttentionOp op) { |
| if (target.getWgp().getMma().empty()) |
| return failure(); |
| |
| const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); |
| |
| // Get iteration domain bounds. |
| OpBuilder b(op); |
| SmallVector<int64_t, 4> bounds = op.getStaticLoopRanges(); |
| |
| auto opInfo = |
| IREE::LinalgExt::AttentionOpDetail::get(op.getIndexingMapsArray()) |
| .value(); |
| |
| int64_t mDim = opInfo.getMDims().back(); |
| int64_t k1Dim = opInfo.getK1Dims().back(); |
| int64_t k2Dim = opInfo.getK2Dims().back(); |
| int64_t nDim = opInfo.getNDims().back(); |
| |
| // Dynamic dims are expected to be taken care of earlier in the pipeline. |
| if (ShapedType::isDynamic(bounds[mDim]) || |
| ShapedType::isDynamic(bounds[k1Dim]) || |
| ShapedType::isDynamic(bounds[k2Dim]) || |
| ShapedType::isDynamic(bounds[nDim])) { |
| return failure(); |
| } |
| |
| // TODO: Do we need a matvec-like attention pipeline? Probably not, |
| // considering M is generally the largest dimension. |
| |
| Value qMatrix = op.getQuery(); |
| Value kMatrix = op.getKey(); |
| Value vMatrix = op.getValue(); |
| |
| SmallVector<GPUMatmulShapeType> intrinsics; |
| intrinsics.reserve(target.getWgp().getMma().size()); |
| for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { |
| auto [mSize, nSize, kSize] = mma.getMNKShape(); |
| auto [aType, bType, cType] = mma.getABCElementTypes(); |
| if (mma.getSubgroupSize() != targetSubgroupSize) |
| continue; |
| intrinsics.emplace_back(mSize, nSize, kSize, aType, bType, cType); |
| } |
| if (intrinsics.empty()) |
| return failure(); |
| |
| // We assume that P (the intermediate result) uses the element type of V for |
| // input and that both matmuls accumulate in f32. Other element types are |
| // possible as well. |
| Type qElementType = getElementTypeOrSelf(qMatrix); |
| Type kElementType = getElementTypeOrSelf(kMatrix); |
| Type vElementType = getElementTypeOrSelf(vMatrix); |
| Type f32Type = b.getF32Type(); |
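| // Illustrative shapes: with Q of shape MxK1, K of shape K2xK1, and V of |
| // shape K2xN, the QK matmul produces the MxK2 score matrix (reducing K1) |
| // and the PV matmul produces the MxN result (reducing K2). |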
| GPUMatmulShapeType qkMatmul{ |
| /*m=*/bounds[mDim], |
| /*n=*/bounds[k2Dim], |
| /*k=*/bounds[k1Dim], |
| /*lhsType=*/qElementType, |
| /*rhsType=*/kElementType, |
| /*accType=*/f32Type, |
| }; |
| GPUMatmulShapeType pvMatmul{/*m=*/bounds[mDim], |
| /*n=*/bounds[nDim], |
| /*k=*/bounds[k2Dim], |
| /*lhsType=*/vElementType, |
| /*rhsType=*/vElementType, |
| /*accType=*/f32Type}; |
| |
| // TODO: Currently, we are forcing the number of subgroups to be 1. This can |
| // be fixed by teaching vector distribution to handle chained matmuls. |
| GPUMMAHeuristicSeeds pvMatmulSeeds = {/*bestSubgroupCountPerWorkgroup=*/4, |
| /*bestMNTileCountPerSubgroup=*/4, |
| /*bestKTileCountPerSubgroup=*/4}; |
| |
| LDBG("Attention Vector Distribution Config"); |
| |
| // Infer if Q, K and V are transposed to help generate a better schedule. |
| bool transposedQ = |
| k1Dim != llvm::cast<AffineDimExpr>(op.getQueryMap().getResults().back()) |
| .getPosition(); |
| bool transposedK = |
| k1Dim != llvm::cast<AffineDimExpr>(op.getKeyMap().getResults().back()) |
| .getPosition(); |
| bool transposedV = |
| k2Dim != llvm::cast<AffineDimExpr>(op.getValueMap().getResults().back()) |
| .getPosition(); |
| |
| int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes(); |
| |
| // First try to find a schedule with an exactly matching intrinsic. |
| std::optional<GPUMMASchedule> schedule = deduceAttentionSchedule( |
| qkMatmul, pvMatmul, intrinsics, pvMatmulSeeds, maxSharedMemoryBytes, |
| targetSubgroupSize, transposedQ, transposedK, transposedV); |
| if (!schedule) { |
| // Then try again by allowing upcasting accumulator. |
| schedule = deduceAttentionSchedule( |
| qkMatmul, pvMatmul, intrinsics, pvMatmulSeeds, maxSharedMemoryBytes, |
| targetSubgroupSize, transposedQ, transposedK, transposedV, |
| /*canUpcastAcc=*/true); |
| } |
| |
| if (!schedule) { |
| LDBG("Failed to deduce Attention schedule"); |
| return failure(); |
| } |
| |
| // TODO: Due to a bug in layout configuration, we cannot set the warp count |
| // on the N dimension. This is, however, OK because we generally do not want |
| // to distribute subgroups on the N dimension anyway. |
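| // For example (illustrative), a deduced schedule with nWarpCount = 2 and |
| // nTileCount = 4 becomes nWarpCount = 1 and nTileCount = 8. |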
| if (schedule->nWarpCount != 1) { |
| schedule->nTileCount *= schedule->nWarpCount; |
| schedule->nWarpCount = 1; |
| } |
| |
| LDBG("Target Subgroup size: " << targetSubgroupSize); |
| LDBG("Schedule: sizes [" << schedule->mSize << ", " << schedule->nSize << ", " |
| << schedule->kSize << "]"); |
| LDBG("Schedule: tile counts [" << schedule->mTileCount << ", " |
| << schedule->nTileCount << ", " |
| << schedule->kTileCount << "]"); |
| LDBG("Schedule: warp counts [" << schedule->mWarpCount << ", " |
| << schedule->nWarpCount << "]"); |
| |
| std::array<int64_t, 3> workgroupSize{ |
| schedule->nWarpCount * targetSubgroupSize, schedule->mWarpCount, 1}; |
| |
| SmallVector<int64_t> workgroupTileSizes(opInfo.getDomainRank(), 0); |
| SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0); |
| // Tile all batch dimensions with unit size. |
| for (int64_t batch : opInfo.getBatchDims()) { |
| workgroupTileSizes[batch] = 1; |
| } |
| |
| // Tile all m, n, and k2 dimensions to 1 except the innermost. Unit dims |
| // from this tiling are folded before vectorization. k1 dimension cannot be |
| // tiled, so we leave it. |
| for (int64_t m : llvm::drop_end(opInfo.getMDims())) { |
| workgroupTileSizes[m] = 1; |
| } |
| for (int64_t n : llvm::drop_end(opInfo.getNDims())) { |
| workgroupTileSizes[n] = 1; |
| } |
| for (int64_t k2 : llvm::drop_end(opInfo.getK2Dims())) { |
| reductionTileSizes[k2] = 1; |
| } |
| |
| // Compute the M/N dimension tile sizes by multiplying the subgroup counts, |
| // tile counts, and intrinsic sizes. |
| workgroupTileSizes[mDim] = |
| schedule->mWarpCount * schedule->mTileCount * schedule->mSize; |
| workgroupTileSizes[nDim] = |
| schedule->nWarpCount * schedule->nTileCount * schedule->nSize; |
| |
| reductionTileSizes[k2Dim] = schedule->kTileCount * schedule->kSize; |
| |
| MLIRContext *context = op.getContext(); |
| SmallVector<NamedAttribute, 2> attrs; |
| attrs.emplace_back(StringAttr::get(context, "workgroup"), |
| b.getI64ArrayAttr(workgroupTileSizes)); |
| attrs.emplace_back(StringAttr::get(context, "reduction"), |
| b.getI64ArrayAttr(reductionTileSizes)); |
| |
| auto configDict = DictionaryAttr::get(context, attrs); |
| auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict); |
| |
| // Attach the MMA schedule as an attribute to the entry point export function |
| // for later access in the pipeline. |
| SmallVector<NamedAttribute, 1> pipelineAttrs; |
| auto scheduleAttr = IREE::GPU::MMAScheduleAttr::get( |
| context, target.getWgp().getMma()[schedule->index], schedule->mWarpCount, |
| schedule->nWarpCount); |
| pipelineAttrs.emplace_back(StringAttr::get(context, "mma_schedule"), |
| scheduleAttr); |
| |
| // TODO: We do not turn prefetching on even when requested by the prefetching |
| // flag because there is a shared memory allocation between the two matmuls, |
| // which the prefetching pass cannot understand. |
| |
| auto pipelineConfig = DictionaryAttr::get(context, pipelineAttrs); |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, loweringConfig, CodeGenPipeline::LLVMGPUVectorDistribute, |
| workgroupSize, targetSubgroupSize, pipelineConfig); |
| } |
| |
| static LogicalResult |
| setVectorDistributionConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| Operation *computeOp) { |
| // We haven't properly plumbed through MMA op layouts and conversions for |
| // CUDA to target NVIDIA GPUs, so disable the vector distribution pipeline |
| // there. |
| if (!isROCmBackend(target)) |
| return failure(); |
| |
| if (!clGPUEnableVectorDistribution) { |
| LDBG("Vector Distribution not enabled, skipping..."); |
| return failure(); |
| } |
| |
| LDBG("VectorDistribution: finding a suitable config..."); |
| |
| if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) { |
| if (linalg::isaContractionOpInterface(linalgOp)) { |
| LDBG("VectorDistribution: trying to find a suitable contraction config"); |
| return setMatmulVectorDistributionConfig(target, entryPoint, linalgOp); |
| } |
| if (linalg::isaConvolutionOpInterface(linalgOp)) { |
| LDBG("VectorDistribution: trying to find a suitable convolution config"); |
| return setConvolutionVectorDistributionConfig(target, entryPoint, |
| linalgOp); |
| } |
| } |
| |
| if (auto attnOp = dyn_cast<IREE::LinalgExt::AttentionOp>(computeOp)) { |
| LDBG("VectorDistribution: trying to find a suitable attention config"); |
| return setAttentionVectorDistributionConfig(target, entryPoint, attnOp); |
| } |
| |
| LDBG("VectorDistribution: failed to find a suitable config"); |
| return failure(); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Contraction Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static LogicalResult setContractConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| linalg::LinalgOp op) { |
| if (!linalg::isaContractionOpInterface(op) || op.getNumParallelLoops() < 2) { |
| return failure(); |
| } |
| |
| // Also exclude matvec-like cases, which have only one non-unit parallel dim; |
| // they should go down a different pipeline. Currently dynamic dimensions are |
| // tiled with size=1 in codegen. |
| int staticNonUnitParallelDimCount = 0; |
| SmallVector<int64_t, 4> bounds = op.getStaticLoopRanges(); |
| FailureOr<mlir::linalg::ContractionDimensions> contractionDims = |
| mlir::linalg::inferContractionDims(op); |
| assert(succeeded(contractionDims) && "Could not infer contraction dims"); |
| for (auto mDim : contractionDims->m) { |
| staticNonUnitParallelDimCount += |
| bounds[mDim] != 1 && !ShapedType::isDynamic(bounds[mDim]); |
| } |
| for (auto nDim : contractionDims->n) { |
| staticNonUnitParallelDimCount += |
| bounds[nDim] != 1 && !ShapedType::isDynamic(bounds[nDim]); |
| } |
| if (staticNonUnitParallelDimCount <= 1) |
| return failure(); |
| |
| // Don't consider operations that don't have a broadcast, those should go |
| // through reductions. |
| if (llvm::any_of(op.getIndexingMapsArray(), |
| [](AffineMap m) { return m.isPermutation(); })) { |
| return failure(); |
| } |
| |
| // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction |
| // pipeline, similar to matvec. Note: because of reassociation in the vector |
| // reduction pipeline, this may lead to precision loss. If this ever becomes |
| // an issue, we can hide this behind a flag. |
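| // For example (illustrative), with a preferred subgroup size of 64, a |
| // 4x4096x4096 matmul (M = 4 <= 4 and N = 4096 > 64) bails out here and is |
| // handled by the vector reduction path instead. |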
| if (llvm::all_equal({contractionDims->m.size(), contractionDims->n.size(), |
| contractionDims->k.size(), size_t{1}}) && |
| contractionDims->batch.empty()) { |
| int64_t mSize = bounds[contractionDims->m.front()]; |
| int64_t nSize = bounds[contractionDims->n.front()]; |
| int64_t preferredSubgroupSize = target.getPreferredSubgroupSize(); |
| if ((mSize <= kVerySkinnyDimThreshold && |
| (nSize > preferredSubgroupSize || ShapedType::isDynamic(nSize))) || |
| (nSize <= kVerySkinnyDimThreshold && |
| (mSize > preferredSubgroupSize || ShapedType::isDynamic(mSize)))) { |
| return failure(); |
| } |
| } |
| |
| // TODO: Properly rematerialize leading elementwise with shared memory |
| // promotion. |
| if (hasFusedLeadingOp(op)) { |
| return failure(); |
| } |
| |
| auto setMatmulConfig = [&entryPoint, &op](int64_t tileX, int64_t tileY, |
| int64_t tileK, |
| ArrayRef<int64_t> workgroupSize, |
| ArrayRef<int32_t> subgroupSizes, |
| unsigned softwarePipelineDepth, |
| CodeGenPipeline pipeline) { |
| TileSizesListType tileSizes; |
| unsigned numParallelLoops = op.getNumParallelLoops(); |
| SmallVector<int64_t> workgroupTileSizes(numParallelLoops - 2, 1); |
| workgroupTileSizes.append({tileX, tileY}); |
| workgroupTileSizes.append(op.getNumReductionLoops(), tileK); |
| |
| SmallVector<unsigned> partitionedLoops = |
| cast<PartitionableLoopsInterface>(op.getOperation()) |
| .getPartitionableLoops(/*maxNumPartitionedLoops=*/std::nullopt); |
| llvm::SmallDenseSet<unsigned, 4> partitionedLoopsSet; |
| partitionedLoopsSet.insert(partitionedLoops.begin(), |
| partitionedLoops.end()); |
| for (auto loopID : llvm::seq<unsigned>(0, numParallelLoops)) { |
| if (!partitionedLoopsSet.count(loopID)) { |
| workgroupTileSizes[loopID] = 0; |
| } |
| } |
| |
| tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level. |
| std::optional<int64_t> subgroupSize = std::nullopt; |
| if (!subgroupSizes.empty()) |
| subgroupSize = subgroupSizes.front(); |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, tileSizes, pipeline, workgroupSize, subgroupSize, |
| getSoftwarePipeliningAttrDict(op->getContext(), softwarePipelineDepth, |
| /*softwarePipelineStoreStage=*/1)); |
| }; |
| // Infer the M, N, and K sizes of the matmul from its operands and indexing |
| // maps. |
| auto lhsShape = |
| llvm::cast<ShapedType>(op.getDpsInputOperand(0)->get().getType()) |
| .getShape(); |
| auto rhsShape = |
| llvm::cast<ShapedType>(op.getDpsInputOperand(1)->get().getType()) |
| .getShape(); |
| int64_t sizeM = ShapedType::kDynamic; |
| int64_t sizeN = ShapedType::kDynamic; |
| int64_t sizeK = ShapedType::kDynamic; |
| auto outputMap = op.getMatchingIndexingMap(op.getDpsInitOperand(0)); |
| for (unsigned i = 0; i < lhsShape.size(); i++) { |
| if (op.getMatchingIndexingMap(op.getDpsInputOperand(0)).getDimPosition(i) == |
| outputMap.getDimPosition(outputMap.getNumResults() - 2)) { |
| sizeM = lhsShape[i]; |
| break; |
| } |
| } |
| for (unsigned i = 0; i < rhsShape.size(); i++) { |
| if (op.getMatchingIndexingMap(op.getDpsInputOperand(1)).getDimPosition(i) == |
| outputMap.getDimPosition(outputMap.getNumResults() - 1)) { |
| sizeN = rhsShape[i]; |
| break; |
| } |
| } |
| SmallVector<unsigned> exprs; |
| op.getReductionDims(exprs); |
| if (exprs.size() == 1) { |
| for (unsigned i = 0; i < lhsShape.size(); i++) { |
| if (op.getMatchingIndexingMap(op.getDpsInputOperand(0)) |
| .getDimPosition(i) == exprs[0]) { |
| sizeK = lhsShape[i]; |
| break; |
| } |
| } |
| } |
| bool isStaticSize = !ShapedType::isDynamic(sizeM) && |
| !ShapedType::isDynamic(sizeN) && |
| !ShapedType::isDynamic(sizeK); |
| if (isStaticSize) { |
| // Try the tensorcore config first. |
| if (supportsTensorCore(target, op)) { |
| SmallVector<TileWorkgroupSizePair> TCtileSizeConfig; |
| Type elementType = |
| cast<ShapedType>(op.getDpsInputOperand(0)->get().getType()) |
| .getElementType(); |
| |
| getTensorCoreConfig(TCtileSizeConfig, elementType, sizeM, sizeN, sizeK); |
| // Pick the best configuration where the original shape is aligned on the |
| // tile size. |
| for (TileWorkgroupSizePair &config : TCtileSizeConfig) { |
| if (sizeK % config.tileSize[2] == 0 && |
| sizeN % config.tileSize[1] == 0 && |
| sizeM % config.tileSize[0] == 0) { |
| CodeGenPipeline codegenPipeline = getTensorCorePipeline(elementType); |
| return setMatmulConfig( |
| config.tileSize[0], config.tileSize[1], config.tileSize[2], |
| config.workgroupSize, |
| target.getWgp().getSubgroupSizeChoices().asArrayRef(), |
| sizeK == config.tileSize[2] ? 1 : config.pipelineDepth, |
| codegenPipeline); |
| } |
| } |
| } |
| // Special case for very small matrices. |
| if (sizeM * sizeN <= target.getPreferredSubgroupSize()) { |
| return setMatmulConfig( |
| sizeN, sizeM, 4, {sizeM, sizeN, 1}, |
| target.getWgp().getSubgroupSizeChoices().asArrayRef(), |
| softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt); |
| } |
| |
| // SIMT matmul case. Query the best configuration. |
| SmallVector<TileWorkgroupSizePair> tileSizeConfig = getMatmulConfig(target); |
| // Pick the best configuration where the original shape is aligned on the |
| // tile size. |
| for (TileWorkgroupSizePair &config : tileSizeConfig) { |
| if (sizeN % config.tileSize[1] == 0 && sizeM % config.tileSize[0] == 0 && |
| sizeK % config.tileSize[2] == 0) { |
| return setMatmulConfig( |
| config.tileSize[0], config.tileSize[1], config.tileSize[2], |
| config.workgroupSize, |
| target.getWgp().getSubgroupSizeChoices().asArrayRef(), |
| softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt); |
| } |
| } |
| } |
| // If we haven't found any config, use the best tile size hoping that |
| // the workgroup specialization handles the main tile path efficiently. |
| SmallVector<TileWorkgroupSizePair> tileSizeConfig = getMatmulConfig(target); |
| constexpr size_t configIndex = 0; |
| const TileWorkgroupSizePair &config = tileSizeConfig[configIndex]; |
| const int64_t tileX = config.tileSize[0]; |
| const int64_t tileY = config.tileSize[1]; |
| int64_t tileK = config.tileSize[2]; |
| // Since specialization doesn't work for the K loop and peeling is not |
| // enabled yet, we pick a tileK size that divides the K size. |
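| // For example (illustrative), starting from tileK = 32 with K = 72: |
| // 72 % 32 != 0 -> 16, 72 % 16 != 0 -> 8, 72 % 8 == 0, so tileK = 8. |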
| if (ShapedType::isDynamic(sizeK)) |
| tileK = 1; |
| while (sizeK % tileK != 0) { |
| tileK >>= 1; |
| } |
| const std::array<int64_t, 3> workgroupSize{config.workgroupSize[0], |
| config.workgroupSize[1], |
| config.workgroupSize[2]}; |
| return setMatmulConfig(tileX, tileY, tileK, workgroupSize, |
| target.getWgp().getSubgroupSizeChoices().asArrayRef(), |
| softwarePipelineDepthSimt, |
| CodeGenPipeline::LLVMGPUMatmulSimt); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // FFT Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static LogicalResult setFftConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| IREE::LinalgExt::FftOp op) { |
| auto interfaceOp = cast<PartitionableLoopsInterface>(*op); |
| auto partitionedLoops = |
| interfaceOp.getPartitionableLoops(kNumMaxParallelDims); |
| unsigned loopDepth = partitionedLoops.back() + 1; |
| SmallVector<int64_t> workgroupTileSize(loopDepth, 0); |
| SmallVector<int64_t, 3> workgroupSize = {target.getPreferredSubgroupSize(), 1, |
| 1}; |
| |
| // Tile along partitioned loops with size 1. |
| for (int64_t loopIndex : partitionedLoops) { |
| workgroupTileSize[loopIndex] = 1; |
| } |
| auto rank = op.getOperandRank(); |
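| // For a constant stage s, the innermost dimension is tiled by 1 << s; e.g. |
| // (illustrative) stage = 3 gives a tile size of 8. |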
| if (workgroupTileSize.size() >= rank && workgroupTileSize[rank - 1] != 0) { |
| APInt value; |
| if (matchPattern(op.getStage(), m_ConstantInt(&value))) { |
| workgroupTileSize[rank - 1] = 1ll << value.getSExtValue(); |
| } else { |
| op.emitError("non-constant stage might not work for fft op"); |
| return failure(); |
| } |
| } |
| TileSizesListType tileSizes = {workgroupTileSize}; |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUDistribute, |
| workgroupSize); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Winograd Pipeline Configuration |
| //===----------------------------------------------------------------------===// |
| template <typename WinogradOp> |
| static LogicalResult setWinogradOpConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| WinogradOp op) { |
| static_assert( |
| llvm::is_one_of<WinogradOp, IREE::LinalgExt::WinogradInputTransformOp, |
| IREE::LinalgExt::WinogradFilterTransformOp, |
| IREE::LinalgExt::WinogradOutputTransformOp>::value, |
| "expected winograd transform op"); |
| auto pipeline = CodeGenPipeline::LLVMGPUWinogradVectorize; |
| TileSizesListType tileSizes; |
| std::array<int64_t, 3> workgroupSize = {32, 4, 4}; |
| int64_t iterationRank = op.getIterationDomainRank(); |
| SmallVector<int64_t> workgroupTileSizes(iterationRank, 4); |
| // Set batch workgroup size |
| workgroupTileSizes.front() = 1; |
| // Set input channel workgroup size |
| workgroupTileSizes.back() = 32; |
| if (isa<IREE::LinalgExt::WinogradFilterTransformOp>(op)) { |
| // Set input channel workgroup size |
| workgroupTileSizes.front() = 32; |
| // Set output channel workgroup size |
| workgroupTileSizes.back() = 16; |
| workgroupSize = {16, 32, 1}; |
| } |
| tileSizes.push_back(workgroupTileSizes); |
| SmallVector<int64_t> threadTileSizes(iterationRank, 1); |
| tileSizes.push_back(threadTileSizes); |
| return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes, |
| pipeline, workgroupSize); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Sort Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static LogicalResult setSortConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| Operation *op) { |
| TileSizesListType tileSizes; |
| auto interfaceOp = cast<PartitionableLoopsInterface>(*op); |
| auto partitionedLoops = |
| interfaceOp.getPartitionableLoops(kNumMaxParallelDims); |
| if (partitionedLoops.empty()) { |
| tileSizes.push_back({}); |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUDistribute, |
| {1, 1, 1}); |
| } |
| size_t numLoops = partitionedLoops.back() + 1; |
| // To get peak occupancy we need a workgroup size of at least two warps. |
| std::array<int64_t, 3> workgroupSize = {2 * target.getPreferredSubgroupSize(), |
| 1, 1}; |
| SmallVector<int64_t> workgroupTileSizes(numLoops, 1); |
| // Set all non-parallel loops to zero tile size. |
| llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(), |
| partitionedLoops.end()); |
| for (auto depth : llvm::seq<int64_t>(0, numLoops)) { |
| if (!partitionedLoopsSet.count(depth)) { |
| workgroupTileSizes[depth] = 0; |
| } |
| } |
| |
| // Tile to have one element per thread. |
| for (int64_t depth = numLoops; depth > 0; depth--) { |
| if (partitionedLoopsSet.count(depth - 1)) { |
| workgroupTileSizes[depth - 1] = workgroupSize[0]; |
| break; |
| } |
| } |
| tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUDistribute, |
| workgroupSize); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Pack/Unpack Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static SmallVector<int64_t> |
| getDefaultWorkgroupTileSizesForPackUnPack(TilingInterface op, |
| int64_t defaultSize) { |
| unsigned numLoops = op.getLoopIteratorTypes().size(); |
| auto partitionedLoops = cast<PartitionableLoopsInterface>(op.getOperation()) |
| .getPartitionableLoops(kNumMaxParallelDims); |
| SmallVector<int64_t> workgroupTileSizes(numLoops, defaultSize); |
| llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(), |
| partitionedLoops.end()); |
| for (auto dim : llvm::seq<int64_t>(0, workgroupTileSizes.size())) { |
| if (!partitionedLoopsSet.count(dim)) { |
| workgroupTileSizes[dim] = 0; |
| } |
| } |
| |
| return workgroupTileSizes; |
| } |
| |
| static LogicalResult setPackConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| tensor::PackOp packOp) { |
| SmallVector<int64_t> tileSizes = getDefaultWorkgroupTileSizesForPackUnPack( |
| cast<TilingInterface>(packOp.getOperation()), |
| target.getPreferredSubgroupSize()); |
| |
| // The default function aims to return the workload per workgroup, but it |
| // does not know that it is working on the packed domain. We need to take the |
| // inner tile sizes into account and adjust the distribution tile sizes. |
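| // For example (illustrative), with a preferred subgroup size of 64 and a |
| // static inner tile of 16 on a packed dimension, that dimension's |
| // distribution tile becomes max(64 / 16, 1) = 4. |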
| SmallVector<int64_t> innerTiles = packOp.getStaticTiles(); |
| ArrayRef<int64_t> dimPos = packOp.getInnerDimsPos(); |
| for (auto [pos, size] : llvm::zip_equal(dimPos, innerTiles)) { |
| if (tileSizes[pos] == 0 || ShapedType::isDynamic(size)) |
| continue; |
| tileSizes[pos] = tileSizes[pos] / size; |
| tileSizes[pos] = std::max<int64_t>(tileSizes[pos], 1); |
| } |
| |
| TileSizesListType tileSizesList = {tileSizes}; |
| std::array<int64_t, 3> workgroupSizes = {target.getPreferredSubgroupSize(), 1, |
| 1}; |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, packOp, tileSizesList, CodeGenPipeline::LLVMGPUPackUnPack, |
| workgroupSizes); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Default Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| // Basic default properties for linalg ops that haven't been tuned. |
| static LogicalResult setRootDefaultConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| Operation *op) { |
| CodeGenPipeline passPipeline = CodeGenPipeline::LLVMGPUDistribute; |
| TileSizesListType tileSizes; |
| auto interfaceOp = cast<PartitionableLoopsInterface>(*op); |
| auto partitionedLoops = interfaceOp.getPartitionableLoops(std::nullopt); |
| if (partitionedLoops.empty()) { |
| tileSizes.push_back({}); |
| return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes, |
| passPipeline, {1, 1, 1}); |
| } |
| |
| const int preferredSubgroupSize = target.getPreferredSubgroupSize(); |
| size_t numLoops = partitionedLoops.back() + 1; |
| // To get peak occupancy we need a workgroup size of at least two warps. |
| std::array<int64_t, 3> workgroupSize = {2 * preferredSubgroupSize, 1, 1}; |
| unsigned vectorSize = 4; |
| SmallVector<int64_t> workgroupTileSizes(numLoops, 1); |
| // Set all non-parallel loops to zero tile size. |
| llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(), |
| partitionedLoops.end()); |
| for (auto depth : llvm::seq<int64_t>(0, numLoops)) { |
| if (!partitionedLoopsSet.count(depth)) { |
| workgroupTileSizes[depth] = 0; |
| } |
| } |
| int64_t skipInnerTiling = 0; |
| if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) { |
| for (auto [index, outputOperand] : |
| llvm::enumerate(genericOp.getDpsInitsMutable())) { |
| if (!genericOp.getMatchingIndexingMap(&outputOperand) |
| .isProjectedPermutation()) { |
| vectorSize = 1; |
| break; |
| } |
| ArrayRef<int64_t> shape = |
| llvm::cast<ShapedType>(outputOperand.get().getType()).getShape(); |
| if (llvm::any_of(shape, ShapedType::isDynamic)) { |
| vectorSize = 1; |
| break; |
| } |
| // Since we vectorize along the innermost dimension, make sure it can be |
| // divided by the number of threads * vectorSize. |
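| // For example (illustrative), with workgroupSize[0] = 128: an inner |
| // dimension of 1536 keeps vectorSize = 4 (1536 % 512 == 0), while an inner |
| // dimension of 256 drops it to 2 (256 % 512 != 0, 256 % 256 == 0). |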
| while (vectorSize > 1 && |
| shape.back() % (workgroupSize[0] * vectorSize) != 0) { |
| vectorSize /= 2; |
| } |
| if (vectorSize == 1) // assume there is fastpath + slowpath |
| vectorSize = 4; |
| int64_t problemSize = std::accumulate( |
| shape.begin(), shape.end(), 1, |
| [](const int64_t &a, const int64_t &b) { return a * b; }); |
| if ((problemSize / (preferredSubgroupSize * vectorSize)) < 64) { |
| vectorSize = 1; |
| break; |
| } |
| // If the inner dimension is too small to have one element per thread, |
| // reduce the workgroup size and try to distribute amongst more dimensions. |
| if (shape.back() < vectorSize * workgroupSize[0]) { |
| int64_t flatWG = workgroupSize[0]; |
| vectorSize = 1; |
| int64_t id = 0; |
| for (int64_t dim : llvm::reverse(shape)) { |
| // Unit loops are already skipped. |
| if (dim == 1) |
| continue; |
| if (dim < flatWG) { |
| skipInnerTiling++; |
| workgroupSize[id] = dim; |
| } else { |
| workgroupSize[id] = flatWG; |
| break; |
| } |
| flatWG = flatWG / dim; |
| id++; |
| if (flatWG <= 1 || id >= workgroupSize.size()) |
| break; |
| } |
| break; |
| } |
| } |
| } |
| |
| auto linalgOp = dyn_cast<linalg::LinalgOp>(op); |
| // Pick a vectorSize of 1 for ops that we know won't get vectorized. |
| // Also skip vectorization for linalg on memref (no result) as the pipeline |
| // relies on tensor level tiling. |
| // TODO(thomasraoux): This could be improved by checking if the linalg op |
| // would fail vectorization. |
| if (!linalgOp || op->getNumResults() != 1 || |
| llvm::any_of(linalgOp.getIndexingMapsArray(), |
| [](AffineMap m) { return !m.isProjectedPermutation(); })) { |
| vectorSize = 1; |
| } else { |
| passPipeline = CodeGenPipeline::LLVMGPUVectorize; |
| } |
| |
| int64_t id = 0; |
| // Set the innermost parallel loop's tile size to workgroupSize * vectorSize. |
| for (int64_t depth = numLoops; depth > 0; depth--) { |
| if (partitionedLoopsSet.count(depth - 1)) { |
| if (skipInnerTiling > 0) { |
| // For dimensions that don't need to be distributed across blocks, skip |
| // tiling by setting the tile size to 0. |
| workgroupTileSizes[depth - 1] = 0; |
| skipInnerTiling--; |
| id++; |
| if (id >= workgroupSize.size()) |
| break; |
| continue; |
| } |
| workgroupTileSizes[depth - 1] = workgroupSize[id] * vectorSize; |
| break; |
| } |
| } |
| |
| if (linalgOp) { |
| // Tile reduction dimensions to 4 to allow load4 if the reduction dimension |
| // is the innermost dimension. |
| workgroupTileSizes.append(linalgOp.getNumReductionLoops(), 4); |
| } |
| tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level |
| return setOpConfigAndEntryPointFnTranslation(entryPoint, op, tileSizes, |
| passPipeline, workgroupSize, |
| preferredSubgroupSize); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Transform Dialect Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| /// Set configuration for transform dialect based strategies. |
| static LogicalResult |
| setTransformDialectConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, Operation *op) { |
| if (!clGPUEnableTransformDialectJit) { |
| return failure(); |
| } |
| |
| auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( |
| entryPoint.getContext(), CodeGenPipeline::TransformDialectCodegen); |
| |
| // TODO: unify the target information into one structure. |
| iree_compiler::gpu::GPUModel gpuModel; |
| gpuModel.hasWarpShuffle = target.supportsSubgroupShuffle(); |
| gpuModel.hasTF32TensorCore = target.supportsTF32InputMMAOps(); |
| gpuModel.hasMmaSync = target.supportsSyncMMAOps(); |
| |
| // Populates a subset of the fragment combinations supported in MLIR lowerings |
| // to NVVM (which is itself a subset of what LLVM supports) based on what the |
| // pipeline currently supports. |
| // TODO: avoid hard coding this and populate based on hardware capabilities. |
| // TODO: add missing supported configs once the pipeline supports it. |
| MLIRContext *context = entryPoint.getContext(); |
| Type f32Type = Float32Type::get(context); |
| Type f16Type = Float16Type::get(context); |
| |
| iree_compiler::gpu::MMAConfig f16f32AccConfig = { |
| /*m=*/16, /*n=*/16, /*k=*/16, |
| /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f32Type}; |
| iree_compiler::gpu::MMAConfig f16f16AccConfig = { |
| /*m=*/16, /*n=*/16, /*k=*/16, |
| /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f16Type}; |
| gpuModel.supportedWMMAConfigs = {f16f32AccConfig, f16f16AccConfig}; |
| |
| if (target.supportsTF32InputMMAOps()) { |
| iree_compiler::gpu::MMAConfig tf32WmmaConfig = { |
| /*m=*/16, /*n=*/16, /*k=*/8, |
| /*aType=*/f32Type, /*bType=*/f32Type, /*cType=*/f32Type}; |
| gpuModel.supportedWMMAConfigs.push_back(tf32WmmaConfig); |
| } |
| |
| if (failed(iree_compiler::gpu::matchAndSetTransformStrategy(entryPoint, op, |
| gpuModel))) |
| return failure(); |
| return setTranslationInfo(entryPoint, translationInfo); |
| } |
| |
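| /// Returns true if `linalgOp` looks like a matrix-vector product: a |
| /// contraction with a single m, n, and k dimension, no batch dimensions, |
| /// and an m dimension with static size 1 (e.g. a static 1xNxK matmul). |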
| static bool isMatvecLike(linalg::LinalgOp linalgOp) { |
| if (linalgOp.getNumParallelLoops() != 2) |
| return false; |
| |
| if (linalgOp.getNumReductionLoops() != 1) |
| return false; |
| |
| // TODO: Allow for matvec with fused dequantization. |
| FailureOr<linalg::ContractionDimensions> dims = |
| linalg::inferContractionDims(linalgOp); |
| if (failed(dims)) |
| return false; |
| |
| // TODO: Support batch matvec. |
| if (!dims->batch.empty()) |
| return false; |
| |
| for (ArrayRef indices : {dims->m, dims->n, dims->k}) { |
| if (!llvm::hasSingleElement(indices)) |
| return false; |
| } |
| |
| // Check if the first parallel dimension has bound 1, indicating we found a |
| // vector shape. |
| SmallVector<int64_t, 4> bounds = linalgOp.getStaticLoopRanges(); |
| if (bounds[dims->m.front()] != 1) |
| return false; |
| |
| return true; |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Warp Reduction Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| /// Set the configuration for reductions that can be mapped to warp reductions. |
| static LogicalResult |
| setWarpReductionConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| linalg::LinalgOp op) { |
| if (!target.supportsSubgroupShuffle()) |
| return failure(); |
| |
| SmallVector<unsigned> parallelDims; |
| SmallVector<unsigned> reductionDims; |
| op.getParallelDims(parallelDims); |
| op.getReductionDims(reductionDims); |
| |
| SmallVector<int64_t, 4> bounds = op.getStaticLoopRanges(); |
| int64_t numParallelDims = op.getNumParallelLoops(); |
| |
| if (reductionDims.empty()) |
| return failure(); |
| |
| // Make sure reduction dimensions are static and are the innermost ones. |
| int64_t numDynamicReductionDims = 0; |
| for (unsigned dim : reductionDims) { |
| if (ShapedType::isDynamic(bounds[dim])) { |
| numDynamicReductionDims++; |
| } |
| if (dim < numParallelDims) { |
| return failure(); |
| } |
| } |
| |
| // Distribution of multi-dim masked writes currently isn't fully supported. |
| if (numDynamicReductionDims > 1) { |
| return failure(); |
| } |
| |
| if (op.getRegionOutputArgs().size() != 1) |
| return failure(); |
| |
| // Only support projected permutations; this could be extended to projected |
| // permutations with broadcast. |
| if (llvm::any_of(op.getDpsInputOperands(), [&](OpOperand *input) { |
| return !op.getMatchingIndexingMap(input).isProjectedPermutation(); |
| })) |
| return failure(); |
| |
| bool foundSingleReductionOutput = false; |
| for (auto [index, initOpOperand] : llvm::enumerate(op.getDpsInitsMutable())) { |
| // Only single combiner operations are supported for now. |
| SmallVector<Operation *> combinerOps; |
| if (matchReduction(op.getRegionOutputArgs(), index, combinerOps) && |
| combinerOps.size() == 1) { |
| if (foundSingleReductionOutput) |
| return failure(); |
| foundSingleReductionOutput = true; |
| continue; |
| } |
| if (!op.getMatchingIndexingMap(&initOpOperand).isIdentity()) |
| return failure(); |
| } |
| if (!foundSingleReductionOutput) |
| return failure(); |
| |
| // Tile all the parallel dimensions to 1. |
| SmallVector<unsigned> partitionedLoops = |
| cast<PartitionableLoopsInterface>(op.getOperation()) |
| .getPartitionableLoops(kNumMaxParallelDims); |
| size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1; |
| SmallVector<int64_t> workgroupTileSizes(numLoops, 1); |
| |
| // Without any bounds on dynamic reduction dims, we need specialization to |
| // get peak performance. For now, just use the warp size. |
| if (numDynamicReductionDims) { |
| SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0); |
| int64_t preferredSubgroupSize = target.getPreferredSubgroupSize(); |
| reductionTileSizes[reductionDims[0]] = preferredSubgroupSize; |
| TileSizesListType tileSizes; |
| tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level |
| tileSizes.emplace_back(std::move(reductionTileSizes)); // Reduction level |
| std::array<int64_t, 3> workgroupSize = {preferredSubgroupSize, 1, 1}; |
| if (failed(setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUWarpReduction, |
| workgroupSize))) { |
| return failure(); |
| } |
| return success(); |
| } |
| |
| int64_t reductionSize = 1; |
| for (int64_t dim : reductionDims) |
| reductionSize *= bounds[dim]; |
| |
| int64_t subgroupSize = 0; |
| for (int s : target.getWgp().getSubgroupSizeChoices().asArrayRef()) { |
| if (reductionSize % s == 0) { |
| subgroupSize = s; |
| break; |
| } |
| } |
| if (subgroupSize == 0) |
| return failure(); |
| |
| const Type elementType = |
| llvm::cast<ShapedType>(op.getDpsInitOperand(0)->get().getType()) |
| .getElementType(); |
| if (!elementType.isIntOrFloat()) |
| return failure(); |
| unsigned bitWidth = elementType.getIntOrFloatBitWidth(); |
| // Reduction distribution only supports 8/16/32 bit types now. |
| if (bitWidth != 32 && bitWidth != 16 && bitWidth != 8) |
| return failure(); |
| |
| const unsigned largestLoadSizeInBits = 128; |
| unsigned vectorSize = largestLoadSizeInBits / bitWidth; |
| while ((reductionSize / vectorSize) % subgroupSize != 0) |
| vectorSize /= 2; |
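| // For example (illustrative, hypothetical sizes): f16 data gives an initial |
| // vectorSize of 128 / 16 = 8; with reductionSize = 4096 and subgroupSize = |
| // 64, 4096 / 8 = 512 is a multiple of 64, so vectorSize stays at 8. |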
| |
| // Deduce the workgroup size we should use for reduction. Currently a |
| // workgroup processes all elements in the reduction dimensions. We need to |
| // make sure the workgroup size we use can divide the total reduction size |
| // and is also within hardware limitations. |
| const int64_t maxWorkgroupSize = 1024; |
| int64_t groupSize = reductionSize / vectorSize; |
| if (groupSize > maxWorkgroupSize) { |
| groupSize = llvm::APIntOps::GreatestCommonDivisor( |
| {64, uint64_t(groupSize)}, {64, uint64_t(maxWorkgroupSize)}) |
| .getZExtValue(); |
| } |
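| // Continuing the illustrative example above: groupSize = 4096 / 8 = 512, |
| // which is within the 1024 limit; a reductionSize of 16384 would instead |
| // give 2048 and be clamped to gcd(2048, 1024) = 1024. |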
| |
| // Then we need to strike a balance-- |
| // 1) parallel dimensions are distributed to workgroups. If there are many |
| // workgroups dispatched, we'd want to have each GPU core hosting multiple |
| // of them for occupancy. |
| // 2) we want each thread to read quite a few 128-bit vectors for better |
| // memory cache behavior. |
| // Both mean we cannot use too large a workgroup size. |
| |
| std::optional<int64_t> parallelSize = 1; |
| for (int64_t dim : parallelDims) { |
| if (ShapedType::isDynamic(bounds[dim])) { |
| parallelSize = std::nullopt; |
| break; |
| } |
| *parallelSize *= bounds[dim]; |
| } |
| // Total parallel size that can fill the GPU with enough workgroups. |
| // TODO: query from the target device; roughly 2x hardware compute unit. |
| const int parallelThreshold = 256; |
| // How many 128-bit vectors each thread should at least read. |
| const int targetVectorCount = 8; |
| while (parallelSize && *parallelSize > parallelThreshold && |
| (groupSize / 2) % subgroupSize == 0 && |
| reductionSize / (groupSize * vectorSize) < targetVectorCount) { |
| // Use fewer subgroups per workgroup... |
| groupSize /= 2; |
| // ...in order to host more workgroups per hardware compute unit. |
| *parallelSize /= 2; |
| } |
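| // Illustrative walk-through (hypothetical sizes): with parallelSize = 1024, |
| // groupSize = 512, subgroupSize = 64, vectorSize = 8, and reductionSize = |
| // 4096, each thread initially reads only one 128-bit vector, so groupSize |
| // is halved to 256 and then 128, at which point the modeled parallelSize |
| // no longer exceeds the 256 threshold and the loop stops. |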
| |
| // The current warp reduction pattern is a two-step butterfly warp reduce: |
| // first, do warp reductions within each subgroup; second, reduce the |
| // per-subgroup results using a single warp reduce. The final warp reduce |
| // requires subgroup count <= subgroup size to work. |
| if ((groupSize / subgroupSize) > subgroupSize) |
| return failure(); |
| |
| // With just one subgroup per workgroup, make each subgroup do more work and |
| // process a few reductions (rows) along the last parallel dimension. |
| // |
| // TODO: This is enabled for matvec on ROCm for now. We should |
| // validate this strategy and extend to more linalg generics and to CUDA. |
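| // Illustrative (hypothetical sizes): with lastParallelBound = 512 and |
| // groupSize = 256, maxParallelFactor = 64 and the loop below settles on |
| // 32 parallel reductions per workgroup. |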
| if (isROCmBackend(target) && llvm::none_of(bounds, ShapedType::isDynamic) && |
| isMatvecLike(op)) { |
| int64_t lastParallelBound = bounds[parallelDims.back()]; |
| int64_t numParallelReductions = 1; |
| const int64_t maxParallelFactor = groupSize / 4; |
| for (int64_t parallelFactor = 2; |
| (parallelFactor < maxParallelFactor) && |
| (lastParallelBound % parallelFactor == 0) && |
| (lastParallelBound > parallelFactor); |
| parallelFactor *= 2) { |
| numParallelReductions = parallelFactor; |
| } |
| workgroupTileSizes.back() = numParallelReductions; |
| } |
| |
| std::array<int64_t, 3> workgroupSize = {groupSize, 1, 1}; |
| SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0); |
| int64_t remainingGroupSize = groupSize; |
| for (int i = reductionDims.size() - 1; i >= 0; --i) { |
| int64_t dim = reductionDims[i]; |
| int64_t bound = bounds[dim]; |
| if (i == reductionDims.size() - 1) |
| bound /= vectorSize; |
| APInt size = llvm::APIntOps::GreatestCommonDivisor( |
| {64, uint64_t(remainingGroupSize)}, {64, uint64_t(bound)}); |
| reductionTileSizes[dim] = size.getSExtValue(); |
| if (i == reductionDims.size() - 1) |
| reductionTileSizes[dim] *= vectorSize; |
| remainingGroupSize /= size.getSExtValue(); |
| } |
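| // For example (illustrative): with a single reduction dim of size 4096, |
| // vectorSize = 8, and groupSize = 128, the GCD of 128 and 4096 / 8 = 512 |
| // is 128, so the reduction tile size becomes 128 * 8 = 1024. |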
| TileSizesListType tileSizes; |
| tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level |
| tileSizes.emplace_back(std::move(reductionTileSizes)); // Reduction level |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUWarpReduction, |
| workgroupSize, subgroupSize); |
| } |
| |
| static bool hasTwoOrThreeLoopsInfo(linalg::LinalgOp linalgOp) { |
| return linalgOp.getNumParallelLoops() >= 2 && |
| linalgOp.getNumParallelLoops() <= 3; |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Transpose Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint, |
| linalg::LinalgOp linalgOp) { |
| LinalgOpInfo opInfo(linalgOp, sharedMemTransposeFilter); |
| |
| // Checks preconditions for shared mem transpose. |
| if (!opInfo.isTranspose() || opInfo.isDynamic() || opInfo.isReduction() || |
| !isa<linalg::GenericOp>(linalgOp) || !hasTwoOrThreeLoopsInfo(linalgOp)) { |
| return failure(); |
| } |
| |
| ArrayRef<OpOperand *> transposedOperands = opInfo.getTransposeOperands(); |
| |
| // Determine the fastest moving dimensions for the source/destination indices |
| // of each transpose. These inform the tile sizes. |
| int64_t outputFastestDim = linalgOp.getNumLoops() - 1; |
| int64_t inputFastestDim = |
| linalgOp.getMatchingIndexingMap(transposedOperands[0]) |
| .getDimPosition(outputFastestDim); |
| // Ensure the fastest moving dimension of the other transposed operands |
| // matches that of the first. |
| for (int i = 1; i < transposedOperands.size(); ++i) { |
| if (inputFastestDim != |
| linalgOp.getMatchingIndexingMap(transposedOperands[i]) |
| .getDimPosition(outputFastestDim)) { |
| return failure(); |
| } |
| } |
| |
| int32_t tileM = 32; |
| int32_t tileN = 32; |
| TileSizesListType tileSizes; |
| // Set all tile sizes to 1 except for fastest moving dimensions. |
| SmallVector<int64_t> tileSizesTemp(linalgOp.getNumLoops(), 1); |
| tileSizesTemp[outputFastestDim] = tileM; |
| tileSizesTemp[inputFastestDim] = tileN; |
| tileSizes.push_back(tileSizesTemp); |
| |
| // Check alignment with tile size for each transpose. Only the fastest moving |
| // dims need to match the transpose tile. |
| auto loopRanges = linalgOp.getStaticLoopRanges(); |
| if (loopRanges[outputFastestDim] % tileM != 0 || |
| loopRanges[inputFastestDim] % tileN != 0) { |
| return failure(); |
| } |
| |
| // Workgroup size contains 8 warps. Configured with 8 threads on the fastest |
| // moving dimension so each thread can execute a vectorized copy of 4 |
| // contiguous elements at a time from the 32-element block (8 x 4 = 32). |
| std::array<int64_t, 3> workgroupSize = {8, 32, 1}; |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPoint, linalgOp, tileSizes, |
| CodeGenPipeline::LLVMGPUTransposeSharedMem, workgroupSize); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // UKernel Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| /// Set the configuration for argmax that can be mapped to argmax uKernel. |
| /// Distribute all parallel dims across different workgroups, and only use a |
| /// single subgroup per workgroup. |
| static LogicalResult |
| setArgmaxUkernelConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPoint, |
| linalg::GenericOp op) { |
| // Checks if UKernels are enabled. |
| if (auto halTarget = IREE::HAL::ExecutableTargetAttr::lookup(entryPoint)) { |
| const char ukernelName[] = "argmax"; |
| if (!hasUkernel(halTarget, ukernelName) || |
| !hasUkernelSupportedGpuArch(halTarget)) { |
| return failure(); |
| } |
| } |
| |
| if (!target.supportsSubgroupShuffle()) |
| return failure(); |
| |
| if (failed(isArgmaxOp(op))) |
| return failure(); |
| SmallVector<unsigned> parallelDims; |
| SmallVector<unsigned> reductionDims; |
| op.getParallelDims(parallelDims); |
| op.getReductionDims(reductionDims); |
| |
| // Currently the Argmax UKernel only supports 1 reduction dim. |
| if (reductionDims.size() != 1) |
| return failure(); |
| |
| // Make sure reduction dimensions are static and are the innermost ones. |
| SmallVector<int64_t, 4> bounds = op.getStaticLoopRanges(); |
| int64_t numParallelDims = op.getNumParallelLoops(); |
| int64_t numDynamicReductionDims = 0; |
| for (unsigned dim : reductionDims) { |
| if (ShapedType::isDynamic(bounds[dim])) { |
| numDynamicReductionDims++; |
| } |
| if (dim < numParallelDims) { |
| return failure(); |
| } |
| } |
| |
| // Distribution of multi-dim masked writes currently isn't fully supported. |
| if (numDynamicReductionDims > 1) { |
| return failure(); |
| } |
| |
| // Tile all the parallel dimensions to 1. |
| SmallVector<unsigned> partitionedLoops = |
| cast<PartitionableLoopsInterface>(op.getOperation()) |
| .getPartitionableLoops(kNumMaxParallelDims); |
| size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1; |
| SmallVector<int64_t> workgroupTileSizes(numLoops, 1); |
| |
| // Currently the Argmax UKernel lets every thread reduce |
| // reductionDim / warpSize elements, and then does a single-step butterfly |
| // warp reduce. Hence it expects workgroupSize to be warpSize (subgroupSize), |
| // and reductionTileSize to be the size of the reduction dim. |
| SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 0); |
| int64_t preferredSubgroupSize = target.getPreferredSubgroupSize(); |
| reductionTileSizes[reductionDims[0]] = preferredSubgroupSize; |
| TileSizesListType tileSizes; |
| tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level |
| tileSizes.emplace_back(std::move(reductionTileSizes)); // Reduction level |
| std::array<int64_t, 3> workgroupSize = {preferredSubgroupSize, 1, 1}; |
| if (failed(setOpConfigAndEntryPointFnTranslation( |
| entryPoint, op, tileSizes, CodeGenPipeline::LLVMGPUDefault, |
| workgroupSize))) { |
| return failure(); |
| } |
| return success(); |
| } |
| |
| /// Make UKernels take the LLVMGPUDefault lowering pipeline. |
| static LogicalResult |
| setUKernelConfig(mlir::FunctionOpInterface entryPoint, |
| IREE::Codegen::UKernelOpInterface ukernelOp) { |
| auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( |
| entryPoint->getContext(), CodeGenPipeline::LLVMGPUDefault); |
| return setTranslationInfo(entryPoint, translationInfo); |
| } |
| |
| /// Decides the tiling and distribution parameters for one convolution |
| /// dimension. Returns true if we can successfully deduce them. |
| /// |
| /// - `inputDim` is the size of the dimension to be distributed. |
| /// - `residualThreads` is the remaining threads we can distribute. |
| /// - `residualTilingFactor` indicates the remaining tiling scale factor. |
| /// - `wgDimSize` will be updated with the decided workgroup dimension size. |
| /// - `wgTileSize` will be updated with the decided workgroup tile size. |
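| /// For example (illustrative): with inputDim = 64, isInnerMostDim = true, |
| /// residualThreads = 64, and residualTilingFactor = 16, the loop settles on |
| /// 16 threads with a tile size of 64 (4 elements per thread), leaving |
| /// residualThreads = 4 and residualTilingFactor = 4. |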
| static bool distributeToOneDim(const int64_t inputDim, |
| const bool isInnerMostDim, |
| int64_t &residualThreads, |
| int64_t &residualTilingFactor, |
| int64_t &wgDimSize, int64_t &wgTileSize) { |
| const int64_t lb = isInnerMostDim ? 2 : 1; |
| for (int64_t dim = residualThreads; dim >= lb; dim >>= 1) { |
| int64_t chosenTileSize = 0; |
| if (isInnerMostDim) { |
| // Handle 4 elements per thread for the innermost dimension. We need |
| // this for vectorized loads. |
| chosenTileSize = 4; |
| if (inputDim % (dim * chosenTileSize) != 0) |
| continue; |
| } else { |
| for (int64_t t = residualTilingFactor; t >= 1; t >>= 1) |
| if (inputDim % (dim * t) == 0) { |
| chosenTileSize = t; |
| break; |
| } |
| } |
| if (chosenTileSize) { |
| wgDimSize = dim; |
| wgTileSize = dim * chosenTileSize; |
| residualThreads /= dim; |
| residualTilingFactor /= chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// Decides the tiling and distribution parameters for two convolution window |
| /// dimensions to two workgroup dimensions as a square. Returns true if we can |
| /// successfully deduce. |
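| /// For example (illustrative): with oh = ow = 56, residualThreads = 16, and |
| /// residualTilingFactor = 8, both workgroup dims get 4 threads and a tile |
| /// size of 8. |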
| static bool distributeToSquare(const int64_t oh, const int64_t ow, |
| int64_t &residualThreads, |
| int64_t &residualTilingFactor, |
| MutableArrayRef<int64_t> wgDimSizes, |
| MutableArrayRef<int64_t> wgTileSizes) { |
| assert(wgDimSizes.size() == 2 && wgTileSizes.size() == 2); |
| |
| const unsigned log2Threads = llvm::Log2_64(residualThreads); |
| if (oh == ow && residualThreads != 1 && log2Threads % 2 == 0) { |
| const int64_t yz = 1ll << (log2Threads / 2); |
| |
| int64_t chosenTileSize = 1ll << (llvm::Log2_64(residualTilingFactor) / 2); |
| while (chosenTileSize >= 1 && ow % (yz * chosenTileSize) != 0) { |
| chosenTileSize >>= 1; |
| } |
| |
| if (chosenTileSize != 0) { |
| wgDimSizes.front() = wgDimSizes.back() = yz; |
| wgTileSizes.front() = wgTileSizes.back() = yz * chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Convolution Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static LogicalResult setConvolutionConfig(IREE::GPU::TargetAttr target, |
| linalg::LinalgOp linalgOp, |
| const int64_t bestTilingFactor) { |
| if (!isa<linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp>(linalgOp)) { |
| return failure(); |
| } |
| const bool isNCHW = isa<linalg::Conv2DNchwFchwOp>(*linalgOp); |
| const bool isNHWC = isa<linalg::Conv2DNhwcHwcfOp>(*linalgOp); |
| |
| const int ohIndex = isNHWC ? 1 : 2; |
| const int owIndex = isNHWC ? 2 : 3; |
| const int ocIndex = isNHWC ? 3 : 1; |
| |
| Type inputType = linalgOp.getDpsInputOperand(0)->get().getType(); |
| ArrayRef<int64_t> inputShape = llvm::cast<ShapedType>(inputType).getShape(); |
| Type outputType = linalgOp.getDpsInitOperand(0)->get().getType(); |
| ArrayRef<int64_t> outputShape = llvm::cast<ShapedType>(outputType).getShape(); |
| if (ShapedType::isDynamic(inputShape[3]) || |
| llvm::any_of(outputShape.drop_front(), ShapedType::isDynamic)) { |
| return failure(); |
| } |
| int64_t oh = outputShape[ohIndex], ow = outputShape[owIndex], |
| oc = outputShape[ocIndex]; |
| |
| // The core idea is to distribute the convolution dimensions to the workgroup |
| // Z/Y/X dimensions, with each thread in a workgroup handling multiple vector |
| // elements. We try to 1) utilize all threads in a subgroup, and 2) handle an |
| // optimal tile size along each dimension. |
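| // Illustrative (hypothetical sizes): for an NHWC conv with oc = 64 and |
| // oh = ow = 56, a subgroup size of 64, and bestTilingFactor = 16, OC gets |
| // 16 threads with a tile of 64, and OH/OW each get 2 threads with a tile |
| // of 4, i.e. a 16x2x2 workgroup. |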
| int64_t residualThreads = target.getPreferredSubgroupSize(); |
| int64_t residualTilingFactor = bestTilingFactor; |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z) |
| SmallVector<int64_t> workgroupTileSizes(4, 1); |
| |
| if (isNCHW) { |
| // OW -> x, OH -> y, OC -> z |
| if (!distributeToOneDim(ow, /*isInnerMostDim=*/true, residualThreads, |
| residualTilingFactor, workgroupSize[0], |
| workgroupTileSizes[3]) || |
| !distributeToOneDim(oh, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[1], |
| workgroupTileSizes[2]) || |
| !distributeToOneDim(oc, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[2], |
| workgroupTileSizes[1])) { |
| return failure(); |
| } |
| } else { |
| // OC -> x |
| if (!distributeToOneDim(oc, /*isInnerMostDim=*/true, residualThreads, |
| residualTilingFactor, workgroupSize[0], |
| workgroupTileSizes[3])) |
| return failure(); |
| |
| // Deduce the configuration for the OW and OH dimensions. Try to make them |
| // even if possible, given we typically have images with the same height |
| // and width. |
| const bool tileToSquare = distributeToSquare( |
| oh, ow, residualThreads, residualTilingFactor, |
| llvm::MutableArrayRef(workgroupSize).drop_front(), |
| llvm::MutableArrayRef(workgroupTileSizes).drop_front().drop_back()); |
| |
| // Otherwise treat OW and OH separately to allow them to have different |
| // numbers of threads and tile sizes. |
| if (!tileToSquare) { |
| if (!distributeToOneDim(ow, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[1], |
| workgroupTileSizes[2]) || |
| !distributeToOneDim(oh, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[2], |
| workgroupTileSizes[1])) { |
| return failure(); |
| } |
| } |
| } |
| auto pipeline = CodeGenPipeline::LLVMGPUVectorize; |
| TileSizesListType tileSizes; |
| // Add reduction tile sizes. |
| if (isNCHW) |
| workgroupTileSizes.append({4, 1, 1}); |
| else if (isNHWC) |
| workgroupTileSizes.append({1, 1, 4}); |
| tileSizes.push_back(workgroupTileSizes); |
| |
| // Tile along OH by size 1 to enable downsizing 2-D convolution to 1-D. |
| SmallVector<int64_t> windowTileSizes(4, 0); |
| windowTileSizes[ohIndex] = 1; |
| tileSizes.push_back(windowTileSizes); |
| auto funcOp = linalgOp->getParentOfType<mlir::FunctionOpInterface>(); |
| return setOpConfigAndEntryPointFnTranslation(funcOp, linalgOp, tileSizes, |
| pipeline, workgroupSize); |
| } |
| |
| //====---------------------------------------------------------------------===// |
| // Pipeline Configuration |
| //====---------------------------------------------------------------------===// |
| |
| static LogicalResult setRootConfig(IREE::GPU::TargetAttr target, |
| mlir::FunctionOpInterface entryPointFn, |
| Operation *computeOp) { |
| LLVM_DEBUG({ |
| DBGS() << "Selecting root config for: "; |
| computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions()); |
| llvm::dbgs() << "\n"; |
| }); |
| // First try to see if there is a transform dialect configuration existing. |
| if (succeeded(setTransformDialectConfig(target, entryPointFn, computeOp))) { |
| LDBG("Transform Dialect Config"); |
| return success(); |
| } |
| if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn, |
| computeOp))) { |
| LDBG("Tile and fuse data tiled multi_mma config"); |
| return success(); |
| } |
| if (clGPUTestTileAndFuseMatmul) { |
| if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn, |
| computeOp))) { |
| LDBG("Tile and fuse matmul config"); |
| return success(); |
| } |
| } |
| if (clGPUTestTileAndFuseVectorize) { |
| if (succeeded(IREE::GPU::setTileAndFuseLoweringConfig(target, entryPointFn, |
| computeOp))) { |
| LDBG("Tile and fuse default config"); |
| return success(); |
| } |
| } |
| if (succeeded(setVectorDistributionConfig(target, entryPointFn, computeOp))) { |
| return success(); |
| } |
| |
| if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) { |
| if (succeeded(setContractConfig(target, entryPointFn, linalgOp))) { |
| LDBG("Contract Config"); |
| return success(); |
| } |
| if (succeeded(setWarpReductionConfig(target, entryPointFn, linalgOp))) { |
| LDBG("Warp Reduction Config"); |
| return success(); |
| } |
| if (succeeded(setConvolutionConfig(target, linalgOp, 16))) { |
| LDBG("Convolution Config"); |
| return success(); |
| } |
| auto genericOp = dyn_cast<linalg::GenericOp>(computeOp); |
| if (genericOp && succeeded(setTransposeConfig(entryPointFn, genericOp))) { |
| LDBG("Transpose Config"); |
| return success(); |
| } else if (genericOp && succeeded(setArgmaxUkernelConfig( |
| target, entryPointFn, genericOp))) { |
| LDBG("Argmax Ukernel Config"); |
| return success(); |
| } |
| } |
| return TypeSwitch<Operation *, LogicalResult>(computeOp) |
| .Case<IREE::LinalgExt::FftOp>([&](auto fftOp) { |
| LDBG("FFT Config"); |
| return setFftConfig(target, entryPointFn, fftOp); |
| }) |
| .Case<IREE::LinalgExt::SortOp>([&](auto sortOp) { |
| LDBG("Sort Config"); |
| return setSortConfig(target, entryPointFn, sortOp); |
| }) |
| .Case<IREE::LinalgExt::WinogradInputTransformOp, |
| IREE::LinalgExt::WinogradOutputTransformOp, |
| IREE::LinalgExt::WinogradFilterTransformOp>([&](auto winogradOp) { |
| LDBG("Winograd Config"); |
| return setWinogradOpConfig(target, entryPointFn, winogradOp); |
| }) |
| .Case<tensor::PackOp>([&](auto packOp) { |
| LDBG("Pack Config"); |
| return setPackConfig(target, entryPointFn, packOp); |
| }) |
| .Case<IREE::Codegen::UKernelOpInterface>([&](auto ukernelOp) { |
| LDBG("Ukernel Config"); |
| return setUKernelConfig(entryPointFn, ukernelOp); |
| }) |
| .Default([&](auto op) { |
| LDBG("Default Config"); |
| return setRootDefaultConfig(target, entryPointFn, computeOp); |
| }); |
| } |
| |
| // Propagate the configuration to the other ops. |
| // TODO(ravishankarm, thomasraoux): This is a very specific use (and |
| // fragile). In general, this should not be needed. Things are already tiled |
| // and distributed. The rest of the compilation must be structured to either |
| // use `TileAndFuse` or they are independent configurations that are |
| // determined based on the op. |
| static void propagateLoweringConfig(Operation *rootOperation, |
| SmallVector<Operation *> computeOps) { |
| if (IREE::Codegen::LoweringConfigAttrInterface config = |
| getLoweringConfig(rootOperation)) { |
| for (auto op : computeOps) { |
| if (op == rootOperation) |
| continue; |
| setLoweringConfig(op, config); |
| } |
| } |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Entry Point |
| //===----------------------------------------------------------------------===// |
| LogicalResult initGPULaunchConfig(FunctionOpInterface funcOp) { |
| IREE::GPU::TargetAttr target = getGPUTargetAttr(funcOp); |
| if (!target) |
| return funcOp.emitError("missing GPU target in #hal.executable.target"); |
| |
| auto exportOp = getEntryPoint(funcOp); |
| if (!getTranslationInfo(funcOp) && exportOp) { |
| // If no translation info is set, first check whether we already have a |
| // workgroup count set--it's a "contract" to indicate that we should |
| // bypass all tiling and distribution to go down just the most basic |
| // lowering flow. |
| if (Block *body = exportOp->getWorkgroupCountBody()) { |
| auto retOp = cast<IREE::HAL::ReturnOp>(body->getTerminator()); |
| // For scalar dispatch cases--using just one thread of one workgroup. |
| auto isOne = [](Value value) { return matchPattern(value, m_One()); }; |
| if (llvm::all_of(retOp.getOperands(), isOne)) { |
| SmallVector<int64_t, 3> workgroupSize = {1, 1, 1}; |
| auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( |
| funcOp.getContext(), CodeGenPipeline::LLVMGPUBaseLowering, |
| workgroupSize); |
| if (failed(setTranslationInfo(funcOp, translationInfo))) { |
| return failure(); |
| } |
| return success(); |
| } |
| } |
| } |
| |
| SmallVector<Operation *> computeOps = getComputeOps(funcOp); |
| if (IREE::Codegen::TranslationInfoAttr translationInfo = |
| getTranslationInfo(funcOp)) { |
| // Currently some ROCDL pipelines require propagation of user lowering |
| // configs. |
| if (needsLoweringConfigPropagation( |
| translationInfo.getDispatchLoweringPassPipeline())) { |
| for (auto op : computeOps) { |
| if (getLoweringConfig(op)) { |
| propagateLoweringConfig(op, computeOps); |
| break; |
| } |
| } |
| } |
| // Translation info (lowering pipeline) is already set. |
| return success(); |
| } |
| |
| Operation *rootOperation = nullptr; |
| |
| // Find the root operation. linalg.generic and linalg.fill are not root |
| // operations if there are other compute operations present. |
| for (Operation *op : llvm::reverse(computeOps)) { |
| if (!isa<linalg::GenericOp, linalg::FillOp>(op)) { |
| rootOperation = op; |
| break; |
| } |
| if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) { |
| // linalg.generic with `reduction` iterator types are roots as well. |
| if (genericOp.getNumLoops() != genericOp.getNumParallelLoops()) { |
| rootOperation = op; |
| break; |
| } |
| } |
| } |
| |
| if (!rootOperation) { |
| for (Operation *op : llvm::reverse(computeOps)) { |
| if (isa<linalg::GenericOp, linalg::FillOp>(op)) { |
| rootOperation = op; |
| break; |
| } |
| } |
| } |
| |
| if (!rootOperation) { |
| // No root operation found; set the pipeline to None. |
| auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( |
| funcOp.getContext(), CodeGenPipeline::None); |
| if (failed(setTranslationInfo(funcOp, translationInfo))) { |
| return failure(); |
| } |
| return success(); |
| } |
| |
| if (failed(setRootConfig(target, funcOp, rootOperation))) |
| return funcOp.emitOpError("failed to set root config"); |
| |
| if (IREE::Codegen::TranslationInfoAttr translationInfo = |
| getTranslationInfo(funcOp)) { |
| // Currently some ROCDL pipelines require propagation of user lowering |
| // configs. |
| if (!needsLoweringConfigPropagation( |
| translationInfo.getDispatchLoweringPassPipeline())) { |
| return success(); |
| } |
| } |
| |
| propagateLoweringConfig(rootOperation, computeOps); |
| return success(); |
| } |
| |
| } // namespace mlir::iree_compiler |