| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/compiler/Codegen/SPIRV/KernelConfig.h" |
| |
| #include <functional> |
| #include <numeric> |
| |
| #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h" |
| #include "iree/compiler/Codegen/Common/UserConfig.h" |
| #include "iree/compiler/Codegen/Dialect/LoweringConfig.h" |
| #include "iree/compiler/Codegen/SPIRV/Utils.h" |
| #include "iree/compiler/Codegen/Transforms/Transforms.h" |
| #include "iree/compiler/Codegen/Utils/GPUUtils.h" |
| #include "iree/compiler/Codegen/Utils/MarkerUtils.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "mlir/Analysis/SliceAnalysis.h" |
| #include "mlir/Dialect/Arith/IR/Arith.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" |
| #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" |
| #include "mlir/Dialect/Utils/StaticValueUtils.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/BuiltinTypes.h" |
| #include "mlir/IR/Matchers.h" |
| |
| #define DEBUG_TYPE "iree-spirv-kernel-config" |
| |
| using llvm::APIntOps::GreatestCommonDivisor; |
| |
| // The default number of subgroups to use per workgroup. |
| constexpr unsigned numSubgroupsPerWorkgroup = 4; |
| // The default number of tiles along each dimension to use per subgroup. |
| constexpr unsigned numTilesPerSubgroupDim = 2; |
| |
| constexpr int kMaxVectorNumBits = 128; |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline; |
| |
| //===----------------------------------------------------------------------===// |
| // Utility Functions |
| //===----------------------------------------------------------------------===// |
| |
| bool isMatmulOrBatchMatmul(linalg::LinalgOp linalgOp) { |
| return linalg::isaContractionOpInterface(linalgOp) && |
| llvm::is_contained({2u, 3u}, linalgOp.getNumParallelLoops()); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Convolution Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| /// Decides the tiling and distribution parameters for one convolution |
| /// dimension. Returns true if we can successfully deduce. |
| /// |
| /// - `inputDim` is the size of the dimension to be distributed. |
| /// - `residualThreads` is the remaining threads we can distribute. |
| /// - `residualTilingFactor` indicates the remaining tiling scale factor. |
| /// - `wgDimSize` will be updated with the decided workgroup dimension size. |
| /// - `wgTileSize` will be updated with the decided workgroup tile size. |
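| /// |
| /// As a hypothetical example (not taken from any particular target): with |
| /// inputDim = 64, isInnerMostDim = true, residualThreads = 64, and |
| /// residualTilingFactor = 32, the loop below settles on dim = 16, since |
| /// 64 % (16 * 4) == 0. That yields wgDimSize = 16 and wgTileSize = 64, and |
| /// leaves residualThreads = 4 and residualTilingFactor = 8. |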
| static bool tileConvOneDim(const int64_t inputDim, const bool isInnerMostDim, |
| int64_t &residualThreads, |
| int64_t &residualTilingFactor, int64_t &wgDimSize, |
| int64_t &wgTileSize) { |
| const int64_t lb = isInnerMostDim ? 2 : 1; |
| for (int64_t dim = residualThreads; dim >= lb; dim >>= 1) { |
| int64_t chosenTileSize = 0; |
| if (isInnerMostDim) { |
| // Handle 4 elements per thread for the innermost dimension. We need |
| // this for vectorized load. |
| chosenTileSize = 4; |
| if (inputDim % (dim * chosenTileSize) != 0) continue; |
| } else { |
| for (int64_t t = residualTilingFactor; t >= 1; t >>= 1) |
| if (inputDim % (dim * t) == 0) { |
| chosenTileSize = t; |
| break; |
| } |
| } |
| if (chosenTileSize) { |
| wgDimSize = dim; |
| wgTileSize = dim * chosenTileSize; |
| residualThreads /= dim; |
| residualTilingFactor /= chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// Decides the tiling and distribution parameters for two convolution window |
| /// dimensions to two workgroup dimensions as a square. Returns true if we can |
| /// successfully deduce. |
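| /// |
| /// As a hypothetical example: with oh = ow = 56, residualThreads = 16, and |
| /// residualTilingFactor = 8, we get yz = 4 and chosenTileSize = 2 (since |
| /// 56 % (4 * 2) == 0), so both workgroup dimensions become 4 with a tile |
| /// size of 8 each. |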
| static bool tileConvSquare(const int64_t oh, const int64_t ow, |
| int64_t &residualThreads, |
| int64_t &residualTilingFactor, |
| MutableArrayRef<int64_t> wgDimSizes, |
| MutableArrayRef<int64_t> wgTileSizes) { |
| assert(wgDimSizes.size() == 2 && wgTileSizes.size() == 2); |
| |
| const unsigned log2Threads = llvm::Log2_64(residualThreads); |
| if (oh == ow && residualThreads != 1 && log2Threads % 2 == 0) { |
| const int64_t yz = 1ll << (log2Threads / 2); |
| |
| int64_t chosenTileSize = 1ll << (llvm::Log2_64(residualTilingFactor) / 2); |
| while (chosenTileSize >= 1 && ow % (yz * chosenTileSize) != 0) { |
| chosenTileSize >>= 1; |
| } |
| |
| if (chosenTileSize != 0) { |
| wgDimSizes.front() = wgDimSizes.back() = yz; |
| wgTileSizes.front() = wgTileSizes.back() = yz * chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| namespace detail { |
| |
| LogicalResult setConvOpConfig(linalg::LinalgOp linalgOp, |
| const int64_t subgroupSize, |
| const int64_t bestTilingFactor) { |
| LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as convolution...\n"); |
| Type inputType = linalgOp.getDpsInputOperand(0)->get().getType(); |
| ArrayRef<int64_t> inputShape = inputType.cast<ShapedType>().getShape(); |
| Type outputType = linalgOp.getDpsInitOperand(0)->get().getType(); |
| ArrayRef<int64_t> outputShape = outputType.cast<ShapedType>().getShape(); |
| |
| const bool isNCHW = isa<linalg::Conv2DNchwFchwOp>(*linalgOp); |
| const bool isNHWC = isa<linalg::Conv2DNhwcHwcfOp>(*linalgOp) || |
| isa<linalg::DepthwiseConv2DNhwcHwcOp>(*linalgOp); |
|   if (!isNCHW && !isNHWC) return success(); |
| |
| const int icIndex = isNHWC ? 3 : 1; |
| const int ohIndex = isNHWC ? 1 : 2; |
| const int owIndex = isNHWC ? 2 : 3; |
| const int ocIndex = isNHWC ? 3 : 1; |
| |
| if (ShapedType::isDynamic(inputShape[icIndex]) || |
| llvm::any_of(outputShape.drop_front(), ShapedType::isDynamic)) { |
| return success(); |
| } |
| |
| const int64_t ic = inputShape[icIndex], oc = outputShape[ocIndex]; |
| const int64_t oh = outputShape[ohIndex], ow = outputShape[owIndex]; |
| |
| // The conversion pipeline requires the input channel dimension to be some |
|   // multiple of four, or less than four. |
| if (!(ic % 4 == 0 || ic < 4)) return success(); |
| |
| // The core idea is to distribute the convolution dimensions to the workgroup |
| // Z/Y/X dimensions, with each thread in a workgroup handling multiple vector |
| // elements. We try to 1) utilize all threads in a subgroup, and 2) handle an |
| // optimal tile size along each dimension. |
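|   // |
|   // As a hypothetical example: for an NHWC output of shape 1x64x64x64 with |
|   // subgroupSize = 64 and bestTilingFactor = 32, the deduction below yields |
|   // workgroupSize = (16, 2, 2) and workgroup tile sizes (N, OH, OW, OC) = |
|   // (0, 4, 4, 64), i.e., each thread computes a 2x2x4 output tile. |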
| |
| int64_t residualThreads = subgroupSize; |
| int64_t residualTilingFactor = bestTilingFactor; |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z) |
| SmallVector<int64_t> workgroupTileSizes(4, 0); |
| |
| if (isNCHW) { |
| // OW -> x, OH -> y, OC -> z |
| if (!tileConvOneDim(ow, /*isInnerMostDim=*/true, residualThreads, |
| residualTilingFactor, workgroupSize[0], |
| workgroupTileSizes[3]) || |
| !tileConvOneDim(oh, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[1], |
| workgroupTileSizes[2]) || |
| !tileConvOneDim(oc, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[2], |
| workgroupTileSizes[1])) { |
| return success(); |
| } |
| } else { |
| // OC -> x |
| if (!tileConvOneDim(oc, /*isInnerMostDim=*/true, residualThreads, |
| residualTilingFactor, workgroupSize[0], |
| workgroupTileSizes[3])) |
| return success(); |
| |
|     // Deduce the configuration for the OW and OH dimensions. Try to treat |
|     // them evenly if possible, given we typically have images with the same |
|     // height and width. |
| const bool tileToSquare = tileConvSquare( |
| oh, ow, residualThreads, residualTilingFactor, |
| llvm::makeMutableArrayRef(workgroupSize).drop_front(), |
| llvm::makeMutableArrayRef(workgroupTileSizes).drop_front().drop_back()); |
| |
| // Otherwise treat OW and OH separately to allow them to have different |
| // number of threads and tiling size. |
| if (!tileToSquare) { |
| if (!tileConvOneDim(ow, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[1], |
| workgroupTileSizes[2]) || |
| !tileConvOneDim(oh, /*isInnerMostDim=*/false, residualThreads, |
| residualTilingFactor, workgroupSize[2], |
| workgroupTileSizes[1])) { |
| return success(); |
| } |
| } |
| } |
| |
| SmallVector<int64_t> threadTileSizes(4, 0); |
| for (int i = 1; i <= 3; ++i) { |
| threadTileSizes[i] = workgroupTileSizes[i] / workgroupSize[3 - i]; |
| } |
| |
| auto pipeline = CodeGenPipeline::SPIRVBaseVectorize; |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(threadTileSizes); |
| // Tiling along reduction dimensions |
| if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) { |
| tileSizes.push_back({0, 0, 0, 0, 4, 1, 1}); // (N, OC, OH, OW, IC, FH, FW) |
| } else if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) { |
| tileSizes.push_back({0, 0, 0, 0, 1, 1, 4}); // (N, OH, OW, OC, FH, FW, IC) |
| } else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) { |
| tileSizes.push_back({0, 0, 0, 0, 1, 1}); // (N, OH, OW, C, FH, FW) |
| } else { |
| return success(); |
| } |
| // Tile along OH by size 1 to enable downsizing 2-D convolution to 1-D. |
| SmallVector<int64_t> windowTileSizes(4, 0); |
| windowTileSizes[ohIndex] = 1; |
| tileSizes.push_back(windowTileSizes); |
| |
| auto funcOp = linalgOp->getParentOfType<func::FuncOp>(); |
| return setOpConfigAndEntryPointFnTranslation(funcOp, linalgOp, tileSizes, |
| pipeline, workgroupSize); |
| } |
| |
| } // namespace detail |
| |
| //===----------------------------------------------------------------------===// |
| // Matmul Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| /// Given the linalg `op`, tries to treat it as a (batch) matmul-like op and |
| /// deduce the index of the loop corresponding to the B/M/N/K dimension, |
| /// respectively. Returns -1 as the index if unable to deduce. |
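| /// |
| /// For example, for a plain linalg.batch_matmul, whose indexing maps are |
| /// (b, m, k) x (b, k, n) -> (b, m, n) over the loops (b, m, n, k), this |
| /// returns (0, 1, 2, 3). |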
| std::tuple<int, int, int, int> getMatmulBMNKIndex(linalg::LinalgOp op, |
| int *lastParallelDim) { |
| OpOperand *lhs = op.getDpsInputOperand(0); |
| OpOperand *rhs = op.getDpsInputOperand(1); |
| auto lhsShape = lhs->get().getType().cast<ShapedType>().getShape(); |
| auto rhsShape = rhs->get().getType().cast<ShapedType>().getShape(); |
| |
| auto lhsLoopIndices = llvm::to_vector(llvm::map_range( |
| llvm::seq<int>(0, lhsShape.size()), |
| [&](int i) { return op.getMatchingIndexingMap(lhs).getDimPosition(i); })); |
| auto rhsLoopIndices = llvm::to_vector(llvm::map_range( |
| llvm::seq<int>(0, rhsShape.size()), |
| [&](int i) { return op.getMatchingIndexingMap(rhs).getDimPosition(i); })); |
| |
| // Figure out what dimension each loop corresponds to. |
| int bIndex = -1, mIndex = -1, nIndex = -1, kIndex = -1; |
| for (unsigned i = 0; i < op.getNumLoops(); ++i) { |
| if (linalg::isReductionIterator(op.getIteratorTypesArray()[i])) { |
| kIndex = i; |
| continue; |
| } |
| |
| const bool inLHS = llvm::is_contained(lhsLoopIndices, i); |
| const bool inRHS = llvm::is_contained(rhsLoopIndices, i); |
| if (inLHS && inRHS) { |
| bIndex = i; |
| } else if (inLHS) { |
| // For cases where we have two parallel dimensions only accessed by |
| // the LHS, treat the outer one of them as the batch dimension. |
| if (mIndex >= 0 && bIndex < 0) bIndex = mIndex; |
| mIndex = i; |
| } else if (inRHS) { |
| // For cases where we have two parallel dimensions only accessed by |
| // the RHS, treat the outer one of them as the batch dimension. |
| if (nIndex >= 0 && bIndex < 0) bIndex = nIndex; |
| nIndex = i; |
| } |
| if (lastParallelDim) *lastParallelDim = i; |
| } |
| |
| LLVM_DEBUG({ |
| llvm::dbgs() << "(B, M, N, K) indices = (" << bIndex << ", " << mIndex |
| << ", " << nIndex << ", " << kIndex << ")\n"; |
| }); |
| return {bIndex, mIndex, nIndex, kIndex}; |
| } |
| |
| /// Decides the tiling and distribution parameters for matmul's N dimension to |
| /// workgroup X dimension. |
| static bool tileMatmulNToWorkgroupX(const int64_t dimN, |
| const int64_t bestThreadN, |
| int64_t &residualThreads, |
| const int64_t bestX, |
| int64_t &residualTilingFactor, |
| int64_t &wgDimSize, int64_t &wgTileSize) { |
| // Deduce the configuration for the N dimension. Start with the best workgroup |
| // X size, and reduce by a factor of two each time. |
| for (int64_t x = bestX; x >= 2; x >>= 1) { |
|     // Let each thread handle `bestThreadN` elements along the innermost |
|     // dimension. We need this for vectorized load. |
| int64_t chosenTileSize = bestThreadN; |
| if (dimN % (x * chosenTileSize) == 0) { |
| wgDimSize = x; |
| wgTileSize = x * chosenTileSize; |
| residualThreads /= x; |
| assert(residualTilingFactor % chosenTileSize == 0); |
| residualTilingFactor /= chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// Decides the tiling and distribution parameters for matmul's M dimension to |
| /// workgroup Y dimension. |
| static bool tileMatmulMToWorkgroupY(const int64_t dimM, |
| const int64_t bestThreadM, |
| int64_t &residualThreads, |
| const int64_t bestY, |
| int64_t &residualTilingFactor, |
| int64_t &wgDimSize, int64_t &wgTileSize) { |
|   // Deduce the configuration for the M dimension. Start from the residual |
|   // number of threads (ideally the best workgroup Y size), and reduce by a |
|   // factor of two each time. |
| for (int64_t y = residualThreads; y >= 1; y >>= 1) { |
| int64_t chosenTileSize = 0; |
| // Reduce the thread tiling size by one each time. We read one row each |
| // time; so it's fine to not be some power of two here. |
| for (int64_t t = bestThreadM; t >= 1; --t) { |
| if (dimM % (y * t) == 0) { |
| chosenTileSize = t; |
| break; |
| } |
| } |
| if (chosenTileSize) { |
| wgDimSize = y; |
| wgTileSize = y * chosenTileSize; |
| assert(residualTilingFactor > chosenTileSize); |
| residualTilingFactor -= chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// Decides the tiling parameters for matmul's K dimension. |
| static bool tileMatmulK(const int64_t dimK, const int64_t residualTilingFactor, |
| int64_t &tileSize) { |
| // Deduce the configuration for the K dimension. We need some power of two |
| // here so that we can do vector load. |
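|   // E.g. (hypothetical values): with residualTilingFactor = 6 and |
|   // dimK = 1024, PowerOf2Floor(6) == 4 and 1024 % 4 == 0, so tileSize is 4. |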
| for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 2; t >>= 1) { |
| if (dimK % t == 0) { |
| tileSize = t; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
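| /// Returns the bytes of workgroup memory needed to hold one promoted LHS tile |
| /// (mTileSize x kTileSize) and one promoted RHS tile (kTileSize x nTileSize), |
| /// including extra padding along K to reduce bank conflicts. For example |
| /// (ignoring the padding term), 64/64/16 M/N/K tiles of 32-bit elements need |
| /// (64 + 64) * 16 * 4 = 8192 bytes per buffer. |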
| int64_t getTileBytes(int64_t mTileSize, int64_t nTileSize, int64_t kTileSize, |
| int64_t elementBits) { |
| const int64_t count = |
| (mTileSize + nTileSize) * |
| (kTileSize + detail::bankConflictReductionPaddingBits / elementBits); |
| return (elementBits / 8) * count; |
| } |
| |
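| /// Returns the total workgroup memory used when multi-buffering |
| /// `singleBufferBytes` with the given software pipeline `depth`; a depth of 0 |
| /// is treated as 1. E.g., 8192 bytes at depth 2 occupy 16384 bytes. |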
| int64_t getMultiBufferMemoryUsage(int64_t singleBufferBytes, unsigned depth) { |
| return singleBufferBytes * (depth ? depth : 1); |
| } |
| |
| /// Tries to adjust workgroup and tile sizes to enable vector load for both |
| /// matmul LHS and RHS. Returns false only when it's not beneficial to promote. |
| static bool adjustToVectorLoad(ArrayRef<int64_t> dimMNKSize, int64_t &mTileSize, |
| int64_t &nTileSize, int64_t &kTileSize, |
| SmallVectorImpl<int64_t> &wgSize, |
| const int64_t subgroupSize, int64_t vectorSize) { |
| const int64_t totalThreads = wgSize[0] * wgSize[1] * wgSize[2]; |
| LLVM_DEBUG(llvm::dbgs() << "initial total thread = " << totalThreads << "\n"); |
| if (totalThreads <= subgroupSize) return false; |
| |
| const bool canVectorLoadLHS = canPerformVectorAccessUsingAllThreads( |
| {mTileSize, kTileSize}, totalThreads, vectorSize); |
| const bool canVectorLoadRHS = canPerformVectorAccessUsingAllThreads( |
| {kTileSize, nTileSize}, totalThreads, vectorSize); |
| LLVM_DEBUG(llvm::dbgs() << "LHS vector load: " << canVectorLoadLHS << "\n"); |
| LLVM_DEBUG(llvm::dbgs() << "RHS vector load: " << canVectorLoadRHS << "\n"); |
| |
| // If we can perform vector load of neither, just don't use shared memory. |
| if (!canVectorLoadLHS && !canVectorLoadRHS) return false; |
| |
|   // If we can only perform vector load for one of the operands, adjust the |
|   // tiling scheme to see if we can make both work. Increase K to load more |
|   // data for the smaller tile; decrease M or N for the larger tile. |
| if (canVectorLoadLHS && !canVectorLoadRHS) { |
| for (const int scale : {2, 4}) { |
| const int64_t newKTileSize = kTileSize * scale; |
| if (dimMNKSize[2] % newKTileSize != 0) continue; |
| const int64_t newMTileSize = mTileSize / scale; |
| const int64_t newWgMDim = wgSize[1] / scale; |
| if (newMTileSize == 0 || newWgMDim == 0) continue; |
| const int64_t newCount = wgSize[0] * newWgMDim * wgSize[2]; |
| if (newCount <= subgroupSize) continue; |
| if (!canPerformVectorAccessUsingAllThreads({newMTileSize, newKTileSize}, |
| newCount, vectorSize) || |
| !canPerformVectorAccessUsingAllThreads({newKTileSize, nTileSize}, |
| newCount, vectorSize)) { |
| continue; |
| } |
| LLVM_DEBUG({ |
| llvm::dbgs() << "initial [M, N, K] tile size = [" << mTileSize << ", " |
| << nTileSize << ", " << kTileSize << "]\n"; |
| llvm::dbgs() << "revised [M, N, K] tile size = [" << newMTileSize |
| << ", " << nTileSize << ", " << newKTileSize << "]\n"; |
| }); |
| mTileSize = newMTileSize; |
| kTileSize = newKTileSize; |
| wgSize[1] = newWgMDim; |
| break; |
| } |
| } |
| // TODO: improve (!canVectorLoadLHS && canVectorLoadRHS) |
| |
| return true; |
| } |
| |
| /// Tries to adjust workgroup and tile sizes to promote matmul LHS and RHS and |
| /// returns true if it's beneficial to promote. |
| static bool adjustToPromote(ArrayRef<int64_t> dimMNKSize, int64_t &mTileSize, |
| int64_t &nTileSize, int64_t &kTileSize, |
| SmallVectorImpl<int64_t> &wgSize, |
| unsigned &pipelineDepth, const int subgroupSize, |
| const int maxBytes, const int elementBits) { |
| LLVM_DEBUG(llvm::dbgs() << "subgroup size = " << subgroupSize << "\n"); |
| const int vectorSize = kMaxVectorNumBits / elementBits; |
| if (!adjustToVectorLoad(dimMNKSize, mTileSize, nTileSize, kTileSize, wgSize, |
| subgroupSize, vectorSize)) |
| return false; |
| |
| // Don't do multibuffering if the inner reduction loop is folded out. |
| if (dimMNKSize[2] == kTileSize) pipelineDepth = 1; |
| |
| auto usedBytes = getTileBytes(mTileSize, nTileSize, kTileSize, elementBits); |
| |
| LLVM_DEBUG(llvm::dbgs() << "initial multibuffering bytes = " |
| << getMultiBufferMemoryUsage(usedBytes, pipelineDepth) |
| << "\n"); |
| |
| // First try to fit the given tile sizes with the largest pipelining depth |
| // possible. |
| do { |
| if (getMultiBufferMemoryUsage(usedBytes, pipelineDepth) <= maxBytes) |
| return true; |
| } while (pipelineDepth-- > 1); |
| |
| // If we can't fit in workgroup memory, don't multibuffer. |
| pipelineDepth = 1; |
| |
| // Using too much workgroup memory. Try to reduce the tile size for X/Y once |
| // by a factor of two. |
| int64_t &wgDimSize = wgSize[0] > wgSize[1] ? wgSize[0] : wgSize[1]; |
| int64_t &tileSize = wgSize[0] > wgSize[1] ? nTileSize : mTileSize; |
| assert(wgDimSize % 2 == 0); |
| wgDimSize /= 2; |
| tileSize /= 2; |
| |
| int64_t totalThreads = wgSize[0] * wgSize[1] * wgSize[2]; |
| LLVM_DEBUG(llvm::dbgs() << "revised total thread = " << totalThreads << "\n"); |
| usedBytes = getTileBytes(mTileSize, nTileSize, kTileSize, elementBits); |
| LLVM_DEBUG(llvm::dbgs() << "revised tile bytes = " << usedBytes << "\n"); |
| return totalThreads > subgroupSize && usedBytes <= maxBytes; |
| } |
| |
| namespace detail { |
| |
| LogicalResult setMatmulOpConfig(spirv::ResourceLimitsAttr limits, |
| linalg::LinalgOp op, |
| std::array<int64_t, 2> bestWorkgroupSizeXY, |
| std::array<int64_t, 3> bestThreadTileSizeMNK, |
| bool enablePromotion, |
| unsigned softwarePipelineDepth) { |
| LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as matmul...\n"); |
| OpOperand *lhs = op.getDpsInputOperand(0); |
| OpOperand *rhs = op.getDpsInputOperand(1); |
| |
| auto lhsType = lhs->get().getType().cast<ShapedType>(); |
| auto rhsType = rhs->get().getType().cast<ShapedType>(); |
| auto elementBits = lhsType.getElementType().getIntOrFloatBitWidth(); |
| if (elementBits != 16 && elementBits != 32) return success(); |
| |
| ArrayRef<int64_t> lhsShape = lhsType.getShape(); |
| ArrayRef<int64_t> rhsShape = rhsType.getShape(); |
| if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success(); |
| if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success(); |
| |
| assert(llvm::is_contained({2u, 3u}, op.getNumParallelLoops())); |
| |
| int lastParallelDim = -1; |
| const auto [bIndex, mIndex, nIndex, kIndex] = |
| getMatmulBMNKIndex(op, &lastParallelDim); |
| if (mIndex < 0 || nIndex < 0 || kIndex < 0) return success(); |
| const bool isBM = bIndex >= 0; |
| |
| SmallVector<int64_t, 4> loopRanges = op.getStaticLoopRanges(); |
| const unsigned numLoops = loopRanges.size(); |
| |
| const int64_t dimM = loopRanges[mIndex]; |
| const int64_t dimK = loopRanges[kIndex]; |
| const int64_t dimN = loopRanges[nIndex]; |
| |
| // The core idea is to distribute the matmul M/N dimension to the workgroup |
| // Y/X dimension, with each thread in a workgroup handling multiple vector |
| // elements. We start from the best (X, Y) and the tiling sizes for (M, N, K) |
| // and try different configurations by scaling them down until we find a |
| // configuration that can perfectly tile the input matmul. |
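|   // |
|   // As a hypothetical example: for a 128x128x128 f32 matmul with |
|   // bestWorkgroupSizeXY = {32, 2} and bestThreadTileSizeMNK = {8, 8, 4}, the |
|   // deduction below settles on workgroupSize = (16, 4, 1), workgroup tiles |
|   // of M = 32 and N = 128, and a reduction tile of K = 4, i.e., each thread |
|   // computes an 8x8 output tile. |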
| |
| const int64_t bestThreadM = bestThreadTileSizeMNK[0], |
| bestThreadN = bestThreadTileSizeMNK[1], |
| bestThreadK = bestThreadTileSizeMNK[2]; |
| |
| int64_t bestX = bestWorkgroupSizeXY[0], bestY = bestWorkgroupSizeXY[1]; |
| // We will deduce a configuration first for x and then y. But look at y here |
| // to see if the problem size is too small; for such cases, "shift" the |
| // parallelism to x. |
| if (dimM < bestThreadM) { |
| int64_t factor = llvm::PowerOf2Ceil(llvm::divideCeil(bestThreadM, dimM)); |
| bestX *= factor; |
| bestY = llvm::divideCeil(bestY, factor); |
| } |
| |
| LLVM_DEBUG({ |
| llvm::dbgs() << "best thread tile size (M, N, K) = (" << bestThreadM << ", " |
| << bestThreadN << ", " << bestThreadK << ")\n"; |
| llvm::dbgs() << "best workgroup size (X, Y) = (" << bestX << ", " << bestY |
| << ")\n"; |
| }); |
| |
| int64_t residualThreads = bestX * bestY; |
| int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN; |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z) |
| SmallVector<int64_t> workgroupTileSizes(numLoops, 0); |
| SmallVector<int64_t> reductionTileSizes(numLoops, 0); |
| |
| if (isBM) workgroupTileSizes[bIndex] = 1; |
| |
| if (!tileMatmulNToWorkgroupX(dimN, bestThreadN, residualThreads, bestX, |
| residualTilingFactor, workgroupSize[0], |
| workgroupTileSizes[nIndex]) || |
| !tileMatmulMToWorkgroupY(dimM, bestThreadM, residualThreads, bestY, |
| residualTilingFactor, workgroupSize[1], |
| workgroupTileSizes[mIndex]) || |
| !tileMatmulK(dimK, residualTilingFactor, reductionTileSizes[kIndex])) { |
| return success(); |
| } |
| LLVM_DEBUG({ |
| llvm::dbgs() << "workgroup tile size before promotion = ("; |
| llvm::interleaveComma(workgroupTileSizes, llvm::dbgs()); |
| llvm::dbgs() << ")\n"; |
| llvm::dbgs() << "reduction tile size before promotion = ("; |
| llvm::interleaveComma(reductionTileSizes, llvm::dbgs()); |
| llvm::dbgs() << ")\n"; |
| llvm::dbgs() << "workgroup size before promotion = ("; |
| llvm::interleaveComma(workgroupSize, llvm::dbgs()); |
| llvm::dbgs() << ")\n"; |
| }); |
| |
| const int subgroupSize = limits.getSubgroupSize(); |
| const int maxBytes = limits.getMaxComputeSharedMemorySize(); |
| |
| // We want a 2-stage pipeline without multi-buffering if the depth is 0 to |
| // keep the default for compilation configs that don't specify a pipeline |
| // depth. |
| auto pipelineDepth = softwarePipelineDepth ? softwarePipelineDepth : 1; |
| |
| // Try to adjust tiling sizes to fit in shared memory. |
| auto usePromotionPipeline = |
| enablePromotion && |
| adjustToPromote({dimM, dimN, dimK}, workgroupTileSizes[mIndex], |
| workgroupTileSizes[nIndex], reductionTileSizes[kIndex], |
| workgroupSize, pipelineDepth, subgroupSize, maxBytes, |
| elementBits); |
| |
| SmallVector<int64_t> threadTileSizes(numLoops, 0); |
| if (isBM) { |
| threadTileSizes[bIndex] = workgroupTileSizes[bIndex] / workgroupSize[2]; |
| } |
| threadTileSizes[mIndex] = workgroupTileSizes[mIndex] / workgroupSize[1]; |
| threadTileSizes[nIndex] = workgroupTileSizes[nIndex] / workgroupSize[0]; |
| |
| TileSizesListType tileSizes; |
| workgroupTileSizes.resize(lastParallelDim + 1); |
| threadTileSizes.resize(lastParallelDim + 1); |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(threadTileSizes); |
| tileSizes.push_back(reductionTileSizes); |
| |
| // Only the promotion pipeline has multibuffering + pipelining. |
| if (usePromotionPipeline) { |
| return setOpConfigAndEntryPointFnTranslation( |
| op->getParentOfType<func::FuncOp>(), op, tileSizes, |
| CodeGenPipeline::SPIRVMatmulPromoteVectorize, workgroupSize, |
| pipelineDepth); |
| } |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| op->getParentOfType<func::FuncOp>(), op, tileSizes, |
| CodeGenPipeline::SPIRVBaseVectorize, workgroupSize); |
| } |
| |
| } // namespace detail |
| |
| //===----------------------------------------------------------------------===// |
| // Cooperative Matrix Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| struct CooperativeMatrixSize { |
| int64_t mSize; // Native cooperative matrix size along M dimension |
| int64_t nSize; // Native cooperative matrix size along N dimension |
| int64_t kSize; // Native cooperative matrix size along K dimension |
| int64_t mWarpCount; // # subgroups along M dimension |
| int64_t nWarpCount; // # subgroups along N dimension |
| int64_t mTileCount; // # tiles per subgroup along M dimension |
| int64_t nTileCount; // # tiles per subgroup along N dimension |
| int64_t kTileCount; // # tiles along K dimension |
| }; |
| |
| /// Returns the cooperative matrix (M, N, K) sizes that are supported by the |
| /// target environment and match the given parameters. |
| static Optional<CooperativeMatrixSize> getCooperativeMatrixSize( |
| spirv::ResourceLimitsAttr resourceLimits, Type aType, Type bType, |
| Type cType, int64_t m, int64_t n, int64_t k) { |
| auto properties = resourceLimits.getCooperativeMatrixPropertiesNv() |
| .getAsRange<spirv::CooperativeMatrixPropertiesNVAttr>(); |
| for (auto property : properties) { |
| if (property.getAType() != aType || property.getBType() != bType || |
| property.getCType() != cType || property.getResultType() != cType || |
| property.getScope().getValue() != spirv::Scope::Subgroup) { |
| continue; // Cannot use this cooperative matrix configuration |
| } |
| |
| const unsigned matmulM = property.getMSize(); |
| const unsigned matmulN = property.getNSize(); |
| const unsigned matmulK = property.getKSize(); |
| if (m % matmulM != 0 || n % matmulN != 0 || k % matmulK != 0) continue; |
| |
| uint64_t nTotalTileCount = n / matmulN; |
| uint64_t mTotalTileCount = m / matmulM; |
| |
| uint64_t remainingWarps = numSubgroupsPerWorkgroup; |
| uint64_t remainingTiles = numTilesPerSubgroupDim * numTilesPerSubgroupDim; |
| uint64_t warpSqrt = 1ull << (llvm::Log2_64(remainingWarps) / 2); |
| uint64_t tileSqrt = 1ull << (llvm::Log2_64(remainingTiles) / 2); |
| |
| int64_t mWarpCount = 0, nWarpCount = 0; |
| int64_t mTileCount = 0, nTileCount = 0; |
| |
| // See if the square root can divide mTotalTileCount. If so it means we can |
| // distribute to both dimensions evenly. Otherwise, try to distribute to N |
| // and then M. |
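|     // E.g. (hypothetical): for 16x16x16 native sizes with m = n = 512 and |
|     // k = 64, both tile counts are 32 and the even split applies, giving |
|     // mWarpCount = nWarpCount = 2, mTileCount = nTileCount = 2, and |
|     // kTileCount = 2. |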
| if (mTotalTileCount > (warpSqrt * tileSqrt) && |
| mTotalTileCount % (warpSqrt * tileSqrt) == 0) { |
| mWarpCount = warpSqrt; |
| mTileCount = tileSqrt; |
| |
| remainingWarps /= warpSqrt; |
| remainingTiles /= tileSqrt; |
| |
| APInt nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount), |
| APInt(64, remainingWarps)); |
| nWarpCount = nGCD.getSExtValue(); |
| nTotalTileCount /= nWarpCount; |
| remainingWarps /= nWarpCount; |
| |
| nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount), |
| APInt(64, remainingTiles)); |
| nTileCount = nGCD.getSExtValue(); |
| } else { |
| APInt nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount), |
| APInt(64, remainingWarps)); |
| nWarpCount = nGCD.getSExtValue(); |
| nTotalTileCount /= nWarpCount; |
| remainingWarps /= nWarpCount; |
| |
| nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount), |
| APInt(64, remainingTiles)); |
| nTileCount = nGCD.getSExtValue(); |
| remainingTiles /= nTileCount; |
| |
| APInt mGCD = GreatestCommonDivisor(APInt(64, mTotalTileCount), |
| APInt(64, remainingWarps)); |
| mWarpCount = mGCD.getSExtValue(); |
| mTotalTileCount /= mWarpCount; |
| remainingWarps /= mWarpCount; |
| |
| mGCD = GreatestCommonDivisor(APInt(64, mTotalTileCount), |
| APInt(64, remainingTiles)); |
| mTileCount = mGCD.getSExtValue(); |
| } |
| |
| const uint64_t kTotalTileCount = k / matmulK; |
| APInt kGCD = GreatestCommonDivisor(APInt(64, kTotalTileCount), |
| APInt(64, numTilesPerSubgroupDim)); |
| int64_t kTileCount = kGCD.getSExtValue(); |
| |
| LLVM_DEBUG({ |
| llvm::dbgs() << "chosen cooperative matrix configuration:\n"; |
| llvm::dbgs() << " (M, N, K) size = (" << matmulM << ", " << matmulN |
| << ", " << matmulK << ")\n"; |
| llvm::dbgs() << " (M, N) subgroup count = (" << mWarpCount << ", " |
| << nWarpCount << ")\n"; |
| llvm::dbgs() << " (M, N, K) tile count per subgroup = (" << mTileCount |
| << ", " << nTileCount << ", " << kTileCount << ")\n"; |
| }); |
| return CooperativeMatrixSize{matmulM, matmulN, matmulK, |
| mWarpCount, nWarpCount, mTileCount, |
| nTileCount, kTileCount}; |
| } |
| return llvm::None; |
| } |
| |
| namespace detail { |
| |
| LogicalResult setCooperativeMatrixConfig(const spirv::TargetEnv &targetEnv, |
| linalg::LinalgOp op) { |
| LLVM_DEBUG(llvm::dbgs() << "trying to matmul tensorcore config...\n"); |
| // This configuration is only for cooperative matrix. |
| if (!targetEnv.allows(spirv::Capability::CooperativeMatrixNV) || |
| !targetEnv.allows(spirv::Extension::SPV_NV_cooperative_matrix)) { |
| return success(); |
| } |
| |
| if (op.hasDynamicShape()) return success(); |
| |
| Value lhs = op.getDpsInputOperand(0)->get(); |
| Value rhs = op.getDpsInputOperand(1)->get(); |
| Value init = op.getDpsInitOperand(0)->get(); |
| |
| int lastParallelDim = -1; |
| const auto [bIndex, mIndex, nIndex, kIndex] = |
| getMatmulBMNKIndex(op, &lastParallelDim); |
| if (mIndex < 0 || nIndex < 0 || kIndex < 0) return success(); |
| const bool isBM = bIndex >= 0; |
| |
| SmallVector<int64_t, 4> loopRanges = op.getStaticLoopRanges(); |
| |
| const int64_t dimM = loopRanges[mIndex]; |
| const int64_t dimK = loopRanges[kIndex]; |
| const int64_t dimN = loopRanges[nIndex]; |
| LLVM_DEBUG({ |
| llvm::dbgs() << "input matmul shape (B, M, N, K) = (" |
| << (bIndex >= 0 ? loopRanges[bIndex] : -1) << ", " << dimM |
| << ", " << dimN << ", " << dimK << ")\n"; |
| }); |
| |
| // TODO: Cooperative matrix support is fairly restricted. We can only have |
| // a curated list of fused element wise ops as defined in the extension |
| // SPV_NV_cooperative_matrix. Check that once we move bufferization after |
| // vectorization. |
| |
| auto getElementType = [](Value v) { |
| return v.getType().cast<ShapedType>().getElementType(); |
| }; |
| |
| spirv::ResourceLimitsAttr resourceLimits = targetEnv.getResourceLimits(); |
| Optional<CooperativeMatrixSize> coopMatSize = getCooperativeMatrixSize( |
| resourceLimits, getElementType(lhs), getElementType(rhs), |
| getElementType(init), dimM, dimN, dimK); |
| if (!coopMatSize) return success(); |
| |
| auto pipeline = IREE::Codegen::DispatchLoweringPassPipeline:: |
| SPIRVCooperativeMatrixVectorize; |
| |
| std::array<int64_t, 3> workgroupSize{ |
| coopMatSize->nWarpCount * resourceLimits.getSubgroupSize(), |
| coopMatSize->mWarpCount, 1}; |
| |
| SmallVector<int64_t> vectorSizes(kIndex + 1, 0); |
| if (isBM) vectorSizes[bIndex] = 1; |
| vectorSizes[mIndex] = coopMatSize->mSize; |
| vectorSizes[nIndex] = coopMatSize->nSize; |
| vectorSizes[kIndex] = coopMatSize->kSize; |
| |
| SmallVector<int64_t> subgroupTileSizes(lastParallelDim + 1, 0); |
| if (isBM) subgroupTileSizes[bIndex] = 1; |
| subgroupTileSizes[mIndex] = coopMatSize->mTileCount * vectorSizes[mIndex]; |
| subgroupTileSizes[nIndex] = coopMatSize->nTileCount * vectorSizes[nIndex]; |
| |
| SmallVector<int64_t> workgroupTileSizes(lastParallelDim + 1, 0); |
| if (isBM) workgroupTileSizes[bIndex] = 1; |
| workgroupTileSizes[mIndex] = |
| coopMatSize->mWarpCount * subgroupTileSizes[mIndex]; |
| workgroupTileSizes[nIndex] = |
| coopMatSize->nWarpCount * subgroupTileSizes[nIndex]; |
| |
|   // Also create one level for reduction. This is needed because |
|   // SPIRVTileAndPromotePass requires it. |
| // TODO(#10499): Consolidate tiling configuration across different pipelines. |
| SmallVector<int64_t> reductionTileSizes; |
| reductionTileSizes.append(kIndex, 0); |
| reductionTileSizes.push_back(coopMatSize->kTileCount * coopMatSize->kSize); |
| |
| TileSizesListType tileSizes; |
|   tileSizes.reserve(4); |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(subgroupTileSizes); |
| tileSizes.push_back(reductionTileSizes); |
| tileSizes.push_back(vectorSizes); |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| op->getParentOfType<func::FuncOp>(), op, tileSizes, pipeline, |
| workgroupSize); |
| } |
| |
| } // namespace detail |
| |
| //===----------------------------------------------------------------------===// |
| // FFT Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| static LogicalResult setFftOpConfig(spirv::ResourceLimitsAttr limits, |
| IREE::LinalgExt::FftOp op) { |
| LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as fft...\n"); |
| const int subgroupSize = limits.getSubgroupSize(); |
| auto pipeline = CodeGenPipeline::SPIRVBaseDistribute; |
| |
| std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1}; |
| |
| SmallVector<utils::IteratorType> loopIteratorTypes = |
| op.getLoopIteratorTypes(); |
| unsigned loopDepth = loopIteratorTypes.size(); |
| SmallVector<int64_t> workgroupTileSize(loopDepth, 0); |
| |
| // Tiling along partitioned loops with size 1. |
| for (auto iteratorType : llvm::enumerate(loopIteratorTypes)) { |
| if (iteratorType.value() == utils::IteratorType::parallel) { |
| workgroupTileSize[iteratorType.index()] = 1; |
| } |
| } |
| auto rank = op.getOperandRank(); |
| if (workgroupTileSize.size() >= rank && workgroupTileSize[rank - 1] != 0) { |
| APInt value; |
| if (matchPattern(op.getStage(), m_ConstantInt(&value))) { |
| workgroupTileSize[rank - 1] = 1ll << value.getSExtValue(); |
| } else { |
| op.emitError("non-constant stage might not work for fft op"); |
| return failure(); |
| } |
| } |
| TileSizesListType tileSizes = {workgroupTileSize}; |
| return setOpConfigAndEntryPointFnTranslation( |
| op->getParentOfType<func::FuncOp>(), op, tileSizes, pipeline, |
| workgroupSize); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Reduction Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| /// Set the configuration for reductions that can be mapped to warp reductions. |
| static LogicalResult setReductionConfig(const spirv::TargetEnv &targetEnv, |
| linalg::GenericOp op) { |
| LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as reduction...\n"); |
| if (op.hasDynamicShape()) return failure(); |
| // This pipeline eventually generates non-uniform group shuffle ops, which |
|   // require special capabilities. |
| if (!targetEnv.allows(spirv::Capability::GroupNonUniformShuffle)) |
| return failure(); |
| |
| SmallVector<unsigned> reductionDims; |
| op.getReductionDims(reductionDims); |
| if (reductionDims.size() != 1 || reductionDims[0] != op.getNumLoops() - 1) |
| return failure(); |
| if (op.getRegionOutputArgs().size() != 1) return failure(); |
| |
| // Only support projected permutation for now. This could be extended to |
|   // projected permutation with broadcast. |
| if (llvm::any_of(op.getDpsInputOperands(), [&](OpOperand *input) { |
| return !op.getMatchingIndexingMap(input).isProjectedPermutation(); |
| })) { |
| return failure(); |
| } |
| |
| // Only support single combiner operations for now. |
| SmallVector<Operation *, 4> combinerOps; |
| if (!matchReduction(op.getRegionOutputArgs(), 0, combinerOps) || |
| combinerOps.size() != 1) { |
| return failure(); |
| } |
| |
| const int subgroupSize = targetEnv.getResourceLimits().getSubgroupSize(); |
| Optional<int64_t> dimSize = op.getStaticLoopRanges()[reductionDims[0]]; |
| if (!dimSize || *dimSize % subgroupSize != 0) return failure(); |
| |
| const Type elementType = |
| op.getOutputs()[0].getType().cast<ShapedType>().getElementType(); |
| if (!elementType.isIntOrFloat()) return failure(); |
| unsigned bitWidth = elementType.getIntOrFloatBitWidth(); |
| // Reduction distribution only supports 32-bit types now. |
| if (bitWidth != 32) return failure(); |
| |
| // Let each thread handle `vectorSize` elements. |
| unsigned vectorSize = kMaxVectorNumBits / bitWidth; |
| while ((*dimSize / vectorSize) % subgroupSize != 0) vectorSize /= 2; |
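|   // E.g. (hypothetical): for an f32 reduction of size 2048 with subgroupSize |
|   // = 32, vectorSize stays 4 since (2048 / 4) % 32 == 0, and groupSize below |
|   // starts at 512 (clamped via GCD if it exceeds the max workgroup size). |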
| |
| // TODO: Add reduction tiling to handle larger reductions. |
| const int64_t maxWorkgroupSize = |
| targetEnv.getResourceLimits().getMaxComputeWorkgroupInvocations(); |
| int64_t groupSize = *dimSize / vectorSize; |
| if (groupSize > maxWorkgroupSize) { |
| groupSize = llvm::APIntOps::GreatestCommonDivisor( |
| {64, uint64_t(groupSize)}, {64, uint64_t(maxWorkgroupSize)}) |
| .getZExtValue(); |
| } |
|   // Current warp reduction pattern is a two-step butterfly warp reduce. |
|   // First, do warp reductions along multiple subgroups. |
|   // Second, reduce results from multiple subgroups using one warp reduce. |
|   // The final warp reduce requires numSubgroupsUsed <= subgroupSize to work. |
|   // TODO(raikonenfnu): Add flexible num of warp reduce to handle more configs. |
|   // TT::CPU and TT::ARM_Valhall are not going through warp reduce. |
| const int64_t numSubgroupsUsed = groupSize / subgroupSize; |
| if (numSubgroupsUsed > subgroupSize) { |
| return failure(); |
| } |
| std::array<int64_t, 3> workgroupSize = {groupSize, 1, 1}; |
|   // Tile all the parallel dimensions to 1. |
| SmallVector<unsigned> partitionedLoops = |
| cast<PartitionableLoopsInterface>(op.getOperation()) |
| .getPartitionableLoops(kNumMaxParallelDims); |
| llvm::SmallDenseSet<unsigned, 4> partitionedLoopsSet; |
| partitionedLoopsSet.insert(partitionedLoops.begin(), partitionedLoops.end()); |
| size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1; |
| SmallVector<int64_t, 4> workgroupTileSizes(numLoops, 1); |
| SmallVector<int64_t, 4> reductionTileSizes(numLoops, 0); |
| reductionTileSizes.push_back(groupSize * vectorSize); |
| |
| TileSizesListType tileSizes; |
| tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level |
| tileSizes.emplace_back(std::move(reductionTileSizes)); // reduction level |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| op->getParentOfType<func::FuncOp>(), op, tileSizes, |
| CodeGenPipeline::SPIRVSubgroupReduce, workgroupSize); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Everything Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits, |
| Operation *op, |
| bool allowVectorization = true) { |
| LLVM_DEBUG(llvm::dbgs() << "trying to deduce as default op...\n"); |
| func::FuncOp funcOp = op->getParentOfType<func::FuncOp>(); |
| auto interfaceOp = cast<PartitionableLoopsInterface>(*op); |
| auto partitionedLoops = |
| interfaceOp.getPartitionableLoops(kNumMaxParallelDims); |
| |
|   // Special case for untiled ops. |
|   if (partitionedLoops.empty()) { |
|     // No tiled loops means we cannot tile (and distribute) at all. Use just |
|     // a single thread to run everything. |
| auto pipeline = CodeGenPipeline::SPIRVBaseDistribute; |
| std::array<int64_t, 3> workgroupSize = {1, 1, 1}; |
| return setOpConfigAndEntryPointFnTranslation(funcOp, op, {}, pipeline, |
| workgroupSize); |
| } |
| |
| const int subgroupSize = limits.getSubgroupSize(); |
| const unsigned loopDepth = partitionedLoops.back() + 1; |
| |
| // Configurations we need to decide. |
| std::array<int64_t, 3> workgroupSize; |
| SmallVector<int64_t> workgroupTileSizes; |
| SmallVector<int64_t> threadTileSizes; |
| |
| // Initialize the configuration. |
| auto initConfiguration = [&]() { |
| workgroupSize = {subgroupSize, 1, 1}; |
| workgroupTileSizes.resize(loopDepth, 0); |
| threadTileSizes.resize(loopDepth, 0); |
| |
| // Initialize tiling along all partitioned loops with size 1. |
| for (int64_t loopIndex : partitionedLoops) { |
| workgroupTileSizes[loopIndex] = threadTileSizes[loopIndex] = 1; |
| } |
| // Override the innermost dimension to distribute to threads in a subgroup. |
| workgroupTileSizes.back() = subgroupSize; |
| threadTileSizes.back() = 1; |
| }; |
| |
|   // Special case for non-linalg ops and linalg ops with more than one output. |
| auto linalgOp = dyn_cast<linalg::LinalgOp>(op); |
| if (!linalgOp || linalgOp.getNumDpsInits() != 1) { |
| auto pipeline = CodeGenPipeline::SPIRVBaseDistribute; |
| |
| initConfiguration(); |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(threadTileSizes); |
| |
| return setOpConfigAndEntryPointFnTranslation(funcOp, op, tileSizes, |
| pipeline, workgroupSize); |
| } |
| |
| // Common case for all linalg ops. |
| |
| // The core idea is to distribute the partitioned loops to the workgroup |
| // dimensions. The goal is to fill up the GPU as much as possible, which means |
| // 1) distributing to as many threads as possible, and 2) avoid assigning too |
| // many threads to handle out-of-bound elements (thus idle). |
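|   // |
|   // As a hypothetical example: for an elementwise op on tensor<128x128xf32> |
|   // with subgroupSize = 64, the scan below picks workgroup tile sizes |
|   // [4, 64], thread tile sizes [1, 4], and workgroupSize = (16, 4, 1), i.e., |
|   // the innermost dimension is vectorized by 4 and all 64 threads are used. |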
| |
| // Returns true if the given `operand` has 32-bit element type. |
| auto has32BitElementType = [](Value operand) { |
| auto shapedType = operand.getType().dyn_cast<ShapedType>(); |
| Type elementType = |
| (shapedType ? shapedType.getElementType() : operand.getType()); |
| return elementType.isa<FloatType>() || elementType.isInteger(32); |
| }; |
| |
| // Whether we can try to use the vectorization pipeline. |
| SmallVector<int64_t, 4> loopBounds = linalgOp.getStaticLoopRanges(); |
| bool vectorizable = |
| allowVectorization && |
| // The vectorization pipeline assumes tensor semantics for tiling. |
| linalgOp.hasTensorSemantics() && !linalgOp.hasIndexSemantics() && |
| // Require all affine maps to be projected permutation so that we can |
| // generate vector transfer ops. |
| llvm::all_of( |
| linalgOp.getIndexingMapsArray(), |
| [](AffineMap map) { return map.isProjectedPermutation(); }) && |
| // TODO: Fix non-32-bit element type vectorization and remove this. |
| llvm::all_of(linalgOp->getOperands(), has32BitElementType) && |
| llvm::none_of(loopBounds, ShapedType::isDynamic); |
| |
|   // Distribute workload to the given `numThreads` by allowing a potential |
|   // loss. |
| auto distributeToThreads = [&](int64_t numThreads, |
| Optional<int64_t> lossFactor = llvm::None) { |
| LLVM_DEBUG(llvm::dbgs() << "\nLoss factor: " << lossFactor << "\n"); |
| initConfiguration(); |
| |
| // Scan from the innermost shape dimension and try to deduce the |
| // configuration for the corresponding GPU workgroup dimension. |
| int64_t wgDim = 0; |
| for (auto shapeDim : llvm::reverse(partitionedLoops)) { |
| int64_t loopBound = loopBounds[shapeDim]; |
| // Skip dynamic dimensions. |
| if (ShapedType::isDynamic(loopBound)) continue; |
| |
|       // Try to find some power of two that can divide the current shape dim |
| // size. This vector keeps the candidate tile sizes. |
| SmallVector<int64_t, 8> candidates; |
| |
|       // For the innermost workgroup dim, try to see if we can have 4 |
| // elements per thread. This enables vectorization. |
| if (vectorizable && wgDim == 0 && !lossFactor) { |
| candidates.push_back(4 * numThreads); |
| } |
|       // Try all power-of-two numbers up to the number of threads. |
| for (unsigned i = numThreads; i >= 1; i >>= 1) { |
| candidates.push_back(i); |
| } |
| LLVM_DEBUG({ |
| llvm::dbgs() << "Candidate tile sizes: ["; |
| llvm::interleaveComma(candidates, llvm::dbgs()); |
| llvm::dbgs() << "]\n"; |
| }); |
| |
| for (int64_t candidate : candidates) { |
| if (loopBound % candidate != 0) { |
| if (!lossFactor) continue; |
| // Skip this candidate if it causes many threads to be idle. |
| int64_t idleThreads = candidate - (loopBound % candidate); |
| if (idleThreads > candidate / *lossFactor) continue; |
| } |
| // If the workload is too small and we cannot distribute to more than 2 |
| // workgroups, try a smaller tile size to increase parallelism. |
| if (partitionedLoops.size() == 1 && candidate > subgroupSize && |
| llvm::divideCeil(loopBound, candidate) <= 2) { |
| continue; |
| } |
| |
| // Found a suitable candidate. Try to let each thread handle 4 |
| // elements if this is the workgroup x dimension. |
| workgroupTileSizes[shapeDim] = candidate; |
| LLVM_DEBUG(llvm::dbgs() << "Chosen tile size: " << candidate << "\n"); |
| if (vectorizable && wgDim == 0 && !lossFactor && candidate % 4 == 0) { |
|           // Use size-1 vectors to increase parallelism if larger ones cause |
| // idle threads in the subgroup. |
| bool hasIdleThreads = |
| partitionedLoops.size() == 1 && candidate <= subgroupSize; |
| int vectorSize = hasIdleThreads ? 1 : 4; |
| LLVM_DEBUG(llvm::dbgs() << "Use vector size: " << vectorSize << "\n"); |
| threadTileSizes[shapeDim] = vectorSize; |
| workgroupSize[wgDim] = candidate / vectorSize; |
| assert(numThreads % (candidate / vectorSize) == 0); |
| numThreads /= candidate / vectorSize; |
| } else { |
| if (wgDim == 0) vectorizable = false; |
| threadTileSizes[shapeDim] = 1; |
| workgroupSize[wgDim] = candidate; |
| assert(numThreads % candidate == 0); |
| numThreads /= candidate; |
| } |
| assert(numThreads >= 1); |
| break; |
| } |
| |
| // Stop if we have distributed all threads. |
| if (numThreads == 1) break; |
| wgDim++; |
| } |
| return numThreads; |
| }; |
| |
| // First try to see if we can use up all threads without any loss. |
| if (distributeToThreads(subgroupSize) != 1) { |
|     // Otherwise, tolerate a larger and larger loss by trying smaller and |
|     // smaller loss factors. |
| |
|     // Threads for distribution. Use at least 32. |
| int64_t numThreads = std::max(subgroupSize, 32); |
| // We can tolerate (1 / lossFactor) of threads in the workgroup to be idle. |
| int64_t lossFactor = 32; |
| |
| for (; lossFactor >= 1; lossFactor >>= 1) { |
| if (distributeToThreads(numThreads, lossFactor) == 1) break; |
| } |
| } |
| |
| auto pipeline = vectorizable ? CodeGenPipeline::SPIRVBaseVectorize |
| : CodeGenPipeline::SPIRVBaseDistribute; |
| |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(threadTileSizes); |
| |
| if (vectorizable) { |
|     // Try to tile all reductions by size 4 if possible. This gives us a chance |
|     // to perform vector4 load if an input has its innermost dimension being a |
|     // reduction. It also avoids generating too many instructions when |
|     // unrolling vectors later. Similarly, also try to tile other untiled |
|     // parallel dimensions by 4 to avoid instruction bloat. |
| SmallVector<int64_t> loopTileSizes(linalgOp.getNumLoops(), 0); |
| for (const auto &it : llvm::enumerate(linalgOp.getIteratorTypesArray())) { |
| auto i = it.index(); |
| if (loopBounds[i] % 4 != 0) continue; |
| if (linalg::isReductionIterator(it.value()) || |
| workgroupTileSizes[i] == 0) { |
| loopTileSizes[it.index()] = 4; |
| } |
| } |
| if (llvm::any_of(loopTileSizes, [](int64_t s) { return s != 0; })) { |
| tileSizes.push_back(loopTileSizes); |
| } |
| } |
| |
| return setOpConfigAndEntryPointFnTranslation(funcOp, op, tileSizes, pipeline, |
| workgroupSize); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Configuration Dispatcher |
| //===----------------------------------------------------------------------===// |
| |
| /// Sets the CodeGen configuration as attributes to the given `rootOp` if it's a |
| /// known Linalg matmul/convolution op with good configurations. |
| static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv, |
| func::FuncOp entryPointFn, |
| Operation *rootOp) { |
| if (IREE::Codegen::CompilationInfoAttr compilationInfo = |
| getCompilationInfo(rootOp)) { |
| // If the op already has a lowering configuration specified from the |
| // original source by the user, then use it directly. |
| return setUserConfig(entryPointFn, rootOp, compilationInfo); |
| } |
| |
| LogicalResult result = success(); |
| // First try to find a proper CodeGen configuration to tile and vectorize for |
| // the current target architecture. |
| switch (targetEnv.getVendorID()) { |
| case spirv::Vendor::AMD: |
| result = detail::setAMDCodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::Apple: |
| result = detail::setAppleCodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::ARM: |
| result = detail::setMaliCodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::NVIDIA: |
| result = detail::setNVIDIACodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::Qualcomm: |
| result = detail::setAdrenoCodeGenConfig(targetEnv, rootOp); |
| break; |
| default: |
| break; |
| } |
| |
| if (failed(result)) return result; |
| // Check whether there is actually a configuration found. If so, it's done. |
| if (getLoweringConfig(rootOp)) return result; |
| |
|   // Otherwise fall back to a default configuration that tiles and |
|   // distributes/vectorizes. |
| spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits(); |
| return TypeSwitch<Operation *, LogicalResult>(rootOp) |
| .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([limits](auto op) { |
| // Try to tile and vectorize first. It's common to see 32 threads |
| // per subgroup for GPUs. |
| std::array<int64_t, 2> workgroupXY = {32, 2}; |
| std::array<int64_t, 3> threadMNK; |
| auto inputType = |
| op.getInputs()[0].getType().template cast<ShapedType>(); |
| if (inputType.getElementType().getIntOrFloatBitWidth() == 16) { |
| threadMNK = {8, 8, 8}; |
| } else { |
| threadMNK = {8, 8, 4}; |
| } |
| auto result = |
| detail::setMatmulOpConfig(limits, op, workgroupXY, threadMNK); |
| if (failed(result)) return result; |
| if (getLoweringConfig(op)) return result; |
| |
| // If unsuccessful, try to tile and distribute. |
| return setDefaultOpConfig(limits, op); |
| }) |
| .Case<linalg::Conv2DNchwFchwOp, linalg::Conv2DNhwcHwcfOp, |
| linalg::DepthwiseConv2DNhwcHwcOp>([limits](auto op) { |
| // Try to tile and vectorize first. It's common to see 32 threads |
| // per subgroup for GPUs. |
| auto result = detail::setConvOpConfig(op, /*subgroupSize=*/32, |
| /*bestTilingFactor=*/32); |
| if (failed(result)) return result; |
| if (getLoweringConfig(op)) return result; |
| |
| // If unsuccessful, try to tile and distribute. |
| return setDefaultOpConfig(limits, op); |
| }) |
| .Case<linalg::ConvolutionOpInterface>([limits](auto op) { |
| // Other convolution/pooling op vectorization is not wired up. |
| return setDefaultOpConfig(limits, op, /*allowVectorization=*/false); |
| }) |
| .Case<linalg::GenericOp>([&](linalg::GenericOp op) { |
| LLVM_DEBUG(llvm::dbgs() << "figuring configuration for generic op\n"); |
| if (succeeded(setReductionConfig(targetEnv, op))) return success(); |
| |
| // If a generic op has reduction iterator types, it can be treated as a |
| // root op for configuration as well. Use the default configuration, |
| // which will mark it as a root. |
| if (op.getNumLoops() != op.getNumParallelLoops()) { |
| return setDefaultOpConfig(limits, op); |
| } |
| return success(); |
| }) |
| .Case<IREE::LinalgExt::FftOp>([limits](IREE::LinalgExt::FftOp op) { |
| return setFftOpConfig(limits, op); |
| }) |
| .Default([](Operation *) { return success(); }); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Entry Point |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult initSPIRVLaunchConfig(ModuleOp module) { |
| llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps = |
| getAllEntryPoints(module); |
| spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(module); |
| if (!targetEnvAttr) { |
| return module.emitOpError( |
| "expected parent hal.executable.variant to have spirv.target_env " |
| "attribute"); |
| } |
| spirv::TargetEnv targetEnv(targetEnvAttr); |
| spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits(); |
| |
| for (auto funcOp : module.getOps<func::FuncOp>()) { |
| auto exportOp = exportOps.lookup(funcOp.getName()); |
| if (!exportOp) continue; |
| |
| SmallVector<Operation *> computeOps; |
| if (failed(getComputeOps(funcOp, computeOps))) { |
| return funcOp.emitOpError("failed to get compute ops"); |
| } |
| |
| if (computeOps.empty()) { |
| return funcOp.emitOpError( |
| "unhandled translation of function without compute ops"); |
| } |
| |
| Operation *rootOperation = nullptr; |
| // Try to find a configuration according to a matmul/convolution op and use |
| // it as the root op. |
| for (Operation *computeOp : computeOps) { |
| if (failed(setSPIRVOpConfig(targetEnv, funcOp, computeOp))) |
| return failure(); |
| |
| // Check if the op configuration was set. |
| if (!getLoweringConfig(computeOp)) continue; |
| |
| if (rootOperation) { |
| return computeOp->emitOpError( |
| "unhandled multiple roots in dispatch region"); |
| } |
| rootOperation = computeOp; |
| } |
| |
| if (!rootOperation) { |
|       // If there is still no root op, fall back to setting a default |
|       // configuration on the last compute op. |
| Operation *computeOp = computeOps.back(); |
| if (failed(setDefaultOpConfig(limits, computeOp))) return failure(); |
| |
| // Check if the op configuration was set. |
| if (!getLoweringConfig(computeOp)) { |
| return computeOp->emitOpError( |
| "without known roots, the last compute operation in the tiled " |
| "loop body is expected to be set as root"); |
| } |
| rootOperation = computeOp; |
| } |
| } |
| return success(); |
| } |
| |
| } // namespace iree_compiler |
| } // namespace mlir |