| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/compiler/Codegen/SPIRV/KernelConfig.h" |
| |
| #include <functional> |
| #include <numeric> |
| |
| #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h" |
| #include "iree/compiler/Codegen/Dialect/LoweringConfig.h" |
| #include "iree/compiler/Codegen/SPIRV/Utils.h" |
| #include "iree/compiler/Codegen/Transforms/Transforms.h" |
| #include "iree/compiler/Codegen/Utils/MarkerUtils.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "mlir/Dialect/Linalg/IR/Linalg.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" |
| #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" |
| #include "mlir/Dialect/Utils/StaticValueUtils.h" |
| #include "mlir/IR/BuiltinOps.h" |
| #include "mlir/IR/Matchers.h" |
| |
| #define DEBUG_TYPE "iree-spirv-kernel-config" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| //===----------------------------------------------------------------------===// |
| // Convolution Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| namespace detail { |
| |
| LogicalResult setConvOpConfig(linalg::LinalgOp linalgOp, |
| const int64_t subgroupSize, |
| const int64_t bestTilingFactor) { |
| ArrayRef<int64_t> inputShape = linalgOp.getInputOperand(0) |
| ->get() |
| .getType() |
| .cast<ShapedType>() |
| .getShape(); |
| ArrayRef<int64_t> outputShape = linalgOp.getOutputOperand(0) |
| ->get() |
| .getType() |
| .cast<ShapedType>() |
| .getShape(); |
| if (isa<linalg::Conv2DNhwcHwcfOp>(*linalgOp) && |
| ShapedType::isDynamic(inputShape[3])) { |
| return success(); |
| } |
| if (llvm::any_of(outputShape.drop_front(), ShapedType::isDynamic)) { |
| return success(); |
| } |
| |
| int64_t ic = inputShape[3]; |
| int64_t oh = outputShape[1], ow = outputShape[2], oc = outputShape[3]; |
| |
| // The conversion pipeline requires the input channel dimension to be a |
| // multiple of four or less than four. |
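| // (For illustration: ic == 3 or ic == 8 passes this check; ic == 6 does not.) |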
| if (!(ic % 4 == 0 || ic < 4)) return success(); |
| |
| // The core idea is to distribute the convolution OH/OW/OC dimensions to the |
| // workgroup Z/Y/X dimensions, with each thread in a workgroup handling |
| // multiple vector elements. We try to 1) utilize all threads in a subgroup, |
| // and 2) handle an optimal tile size along each dimension. |
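| // |
| // Illustrative example (one possible outcome, assuming subgroupSize = 32 and |
| // bestTilingFactor = 32): for a conv with a 1x112x112x64 output, the OC loop |
| // below picks 16 threads with 4 elements each (64 % (16 * 4) == 0), leaving |
| // 2 residual threads and a residual tiling factor of 8; OW then gets 2 |
| // threads with a tile size of 8 and OH gets 1 thread with a tile size of 1. |
| // The result is workgroup size (X, Y, Z) = (16, 2, 1), workgroup tile sizes |
| // (N, OH, OW, OC) = (0, 1, 16, 64), and invocation tile sizes (0, 1, 8, 4). |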
| |
| int64_t residualThreads = subgroupSize; |
| int64_t residualTilingFactor = bestTilingFactor; |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z) |
| SmallVector<int64_t> workgroupTileSizes(4, 0); // (N, OH, OW, OC) |
| SmallVector<int64_t> invocationTileSizes(4, 0); // (N, OH, OW, OC) |
| |
| // Deduce the configuration for the OC dimension. |
| for (int64_t x = residualThreads; x >= 2; x >>= 1) { |
| // Handle 4 elements per thread for the innermost dimension. We need this |
| // for vectorized loads. |
| int64_t chosenTileSize = 4; |
| if (oc % (x * chosenTileSize) == 0) { |
| workgroupSize[0] = x; |
| workgroupTileSizes[3] = x * chosenTileSize; |
| invocationTileSizes[3] = chosenTileSize; |
| residualThreads /= x; |
| residualTilingFactor /= chosenTileSize; |
| break; |
| } |
| } |
| if (workgroupTileSizes[3] == 0) return success(); |
| |
| // Deduce the configuration for the OW and OH dimensions. Try to make them |
| // equal if possible, given that images typically have the same height and |
| // width. |
| bool tileToSquare = false; |
| unsigned log2Threads = llvm::Log2_64(residualThreads); |
| if (ow == oh && residualThreads != 1 && log2Threads % 2 == 0) { |
| int64_t yz = 1ll << (log2Threads / 2); |
| |
| int64_t chosenTileSize = 1ll << (llvm::Log2_64(residualTilingFactor) / 2); |
| while (chosenTileSize >= 1 && ow % (yz * chosenTileSize) != 0) { |
| chosenTileSize >>= 1; |
| } |
| |
| if (chosenTileSize != 0) { |
| workgroupSize[1] = workgroupSize[2] = yz; |
| workgroupTileSizes[2] = workgroupTileSizes[1] = yz * chosenTileSize; |
| invocationTileSizes[2] = invocationTileSizes[1] = chosenTileSize; |
| tileToSquare = true; |
| } |
| } |
| |
| // Otherwise treat OW and OH separately to allow them to have different |
| // numbers of threads and tile sizes. |
| if (!tileToSquare) { |
| // Decide the tiling and distribution parameters for one dimension. |
| auto decideOneDim = [&](int64_t inputDim, int64_t &wgDimSize, |
| int64_t &wgTileSize, int64_t &invoTileSize) { |
| for (int64_t dim = residualThreads; dim >= 1; dim >>= 1) { |
| int64_t chosenTileSize = 0; |
| for (int64_t t = residualTilingFactor; t >= 1; t >>= 1) { |
| if (inputDim % (dim * t) == 0) { |
| chosenTileSize = t; |
| break; |
| } |
| } |
| if (chosenTileSize) { |
| wgDimSize = dim; |
| wgTileSize = dim * chosenTileSize; |
| invoTileSize = chosenTileSize; |
| residualThreads /= dim; |
| residualTilingFactor /= chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| }; |
| |
| if (!decideOneDim(ow, workgroupSize[1], workgroupTileSizes[2], |
| invocationTileSizes[2]) || |
| !decideOneDim(oh, workgroupSize[2], workgroupTileSizes[1], |
| invocationTileSizes[1])) { |
| return success(); |
| } |
| } |
| |
| auto pipeline = IREE::Codegen::DispatchLoweringPassPipeline::SPIRVVectorize; |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(invocationTileSizes); |
| // Tiling along reduction dimensions |
| if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) { |
| tileSizes.push_back({0, 0, 0, 0, 1, 1, 4}); |
| } else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) { |
| tileSizes.push_back({0, 0, 0, 0, 1, 1}); |
| } else { |
| return success(); |
| } |
| |
| auto funcOp = linalgOp->getParentOfType<func::FuncOp>(); |
| return setOpConfigAndEntryPointFnTranslation(funcOp, linalgOp, tileSizes, |
| pipeline, workgroupSize); |
| } |
| |
| } // namespace detail |
| |
| //===----------------------------------------------------------------------===// |
| // Matmul Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| namespace detail { |
| |
| LogicalResult setMatmulOpConfig(linalg::LinalgOp op, int64_t subgroupSize, |
| std::array<int64_t, 2> bestWorkgroupSizeXY, |
| std::array<int64_t, 3> bestThreadTileSizeMNK, |
| bool useWorkgroupMemory) { |
| auto lhsType = op.inputs()[0].getType().cast<ShapedType>(); |
| auto elementBits = lhsType.getElementType().getIntOrFloatBitWidth(); |
| if (elementBits != 16 && elementBits != 32) return success(); |
| |
| ArrayRef<int64_t> lhsShape = |
| op.getInputOperand(0)->get().getType().cast<ShapedType>().getShape(); |
| ArrayRef<int64_t> rhsShape = |
| op.getInputOperand(1)->get().getType().cast<ShapedType>().getShape(); |
| if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success(); |
| if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success(); |
| |
| bool isBM = isa<linalg::BatchMatmulOp>(op); |
| |
| int64_t dimM = lhsShape[0 + isBM]; |
| int64_t dimK = lhsShape[1 + isBM]; |
| int64_t dimN = rhsShape[1 + isBM]; |
| |
| // The core idea is to distribute the matmul M/N dimensions to the workgroup |
| // Y/X dimensions, with each thread in a workgroup handling multiple vector |
| // elements. We start from the best (X, Y) and the tiling sizes for (M, N, K) |
| // and try different configurations by scaling them down until we find a |
| // configuration that can perfectly tile the input matmul. |
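| // |
| // Illustrative example (one possible outcome, assuming bestX = 32, bestY = 2, |
| // and (bestThreadM, bestThreadN, bestThreadK) = (8, 8, 4)): for a static |
| // 1024x1024x1024 matmul, residualThreads starts at 64 and |
| // residualTilingFactor at (8 + 4) * 8 = 96. The N loop below picks 32 threads |
| // with 8 elements each (N tile 256), the M loop picks 2 threads with 8 |
| // elements each (M tile 16), and the K loop picks a reduction tile of 4. The |
| // result is workgroup size (32, 2, 1), workgroup tile sizes (M, N) = |
| // (16, 256), invocation tile sizes (8, 8), and reduction tile sizes |
| // (0, 0, 4). |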
| |
| const int64_t bestX = bestWorkgroupSizeXY[0], bestY = bestWorkgroupSizeXY[1]; |
| const int64_t bestThreadM = bestThreadTileSizeMNK[0], |
| bestThreadN = bestThreadTileSizeMNK[1], |
| bestThreadK = bestThreadTileSizeMNK[2]; |
| |
| int64_t residualThreads = bestX * bestY; |
| int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN; |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z) |
| SmallVector<int64_t> workgroupTileSizes(2 + isBM, 0); // ([B,] M, N) |
| SmallVector<int64_t> invocationTileSizes(2 + isBM, 0); // ([B,] M, N) |
| SmallVector<int64_t> reductionTileSizes(3 + isBM, 0); // ([B,] M, N, K) |
| |
| if (isBM) workgroupTileSizes[0] = invocationTileSizes[0] = 1; |
| |
| // Deduce the configuration for the N dimension. Start with the best workgroup |
| // X size, and reduce by a factor of two each time. |
| for (int64_t x = bestX; x >= 2; x >>= 1) { |
| // Handle `bestThreadN` elements per thread for the innermost dimension. We |
| // need this for vectorized loads. |
| int64_t chosenTileSize = bestThreadN; |
| if (dimN % (x * chosenTileSize) == 0) { |
| workgroupSize[0] = x; |
| workgroupTileSizes[1 + isBM] = x * chosenTileSize; |
| invocationTileSizes[1 + isBM] = chosenTileSize; |
| residualThreads /= x; |
| assert(residualTilingFactor % chosenTileSize == 0); |
| residualTilingFactor /= chosenTileSize; |
| break; |
| } |
| } |
| if (workgroupTileSizes[1 + isBM] == 0) return success(); |
| |
| // Deduce the configuration for the M dimension. Start with the residual |
| // workgroup threads, and reduce by a factor of two each time. |
| for (int64_t y = residualThreads; y >= 1; y >>= 1) { |
| int64_t chosenTileSize = 0; |
| // Reduce the thread tile size by one each time. We read one row at a time, |
| // so it's fine for this not to be a power of two. |
| for (int64_t t = bestThreadM; t >= 1; --t) { |
| if (dimM % (y * t) == 0) { |
| chosenTileSize = t; |
| break; |
| } |
| } |
| if (chosenTileSize) { |
| workgroupSize[1] = y; |
| workgroupTileSizes[0 + isBM] = y * chosenTileSize; |
| invocationTileSizes[0 + isBM] = chosenTileSize; |
| assert(residualTilingFactor > chosenTileSize); |
| residualTilingFactor -= chosenTileSize; |
| break; |
| } |
| } |
| if (workgroupTileSizes[0 + isBM] == 0) return success(); |
| |
| // Deduce the configuration for the K dimension. We need a power of two here |
| // so that we can do vector loads. |
| for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 2; t >>= 1) { |
| if (dimK % t == 0) { |
| reductionTileSizes[2 + isBM] = t; |
| break; |
| } |
| } |
| if (reductionTileSizes[2 + isBM] == 0) return success(); |
| |
| auto totalThreads = |
| std::accumulate(workgroupSize.begin(), workgroupSize.end(), int64_t(1), |
| std::multiplies<int64_t>()); |
| auto pipeline = |
| (useWorkgroupMemory && totalThreads > subgroupSize) |
| ? IREE::Codegen::DispatchLoweringPassPipeline:: |
| SPIRVVectorizeWithWorkgroupMemory |
| : IREE::Codegen::DispatchLoweringPassPipeline::SPIRVVectorize; |
| |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(invocationTileSizes); |
| tileSizes.push_back(reductionTileSizes); |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| op->getParentOfType<func::FuncOp>(), op, tileSizes, pipeline, |
| workgroupSize); |
| } |
| |
| } // namespace detail |
| |
| //===----------------------------------------------------------------------===// |
| // FFT Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| static LogicalResult setFftOpConfig(spirv::ResourceLimitsAttr limits, |
| IREE::LinalgExt::FftOp op) { |
| const int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue(); |
| auto pipeline = IREE::Codegen::DispatchLoweringPassPipeline::SPIRVDistribute; |
| |
| std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1}; |
| |
| auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op); |
| auto partitionedLoops = |
| interfaceOp.getPartitionableLoops(kNumMaxParallelDims); |
| |
| unsigned loopDepth = partitionedLoops.back() + 1; |
| SmallVector<int64_t> workgroupTileSize(loopDepth, 0); |
| |
| // Tiling along partitioned loops with size 1. |
| for (int64_t loopIndex : partitionedLoops) { |
| workgroupTileSize[loopIndex] = 1; |
| } |
| auto rank = op.getOperandRank(); |
| if (workgroupTileSize.size() >= rank && workgroupTileSize[rank - 1] != 0) { |
| APInt value; |
| if (matchPattern(op.getStage(), m_ConstantInt(&value))) { |
| workgroupTileSize[rank - 1] = 1ll << value.getSExtValue(); |
| } else { |
| op.emitError("non-constant stage might not work for fft op"); |
| return failure(); |
| } |
| } |
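| // For illustration: when the innermost loop is partitioned and the stage |
| // operand is the constant 3, the innermost workgroup tile size becomes |
| // 1 << 3 = 8. |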
| TileSizesListType tileSizes = {workgroupTileSize}; |
| return setOpConfigAndEntryPointFnTranslation( |
| op->getParentOfType<func::FuncOp>(), op, tileSizes, pipeline, |
| workgroupSize); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Everything Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits, |
| Operation *op) { |
| LLVM_DEBUG(llvm::dbgs() << "Using default config for op: " << *op << "\n"); |
| func::FuncOp funcOp = op->getParentOfType<func::FuncOp>(); |
| auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op); |
| auto partitionedLoops = |
| interfaceOp.getPartitionableLoops(kNumMaxParallelDims); |
| |
| // Special case for untiled ops. |
| if (partitionedLoops.empty()) { |
| // No tiled loops means we cannot tile (and distribute) at all. Use a single |
| // thread to run everything. |
| auto pipeline = |
| IREE::Codegen::DispatchLoweringPassPipeline::SPIRVDistribute; |
| std::array<int64_t, 3> workgroupSize = {1, 1, 1}; |
| return setOpConfigAndEntryPointFnTranslation(funcOp, op, {}, pipeline, |
| workgroupSize); |
| } |
| |
| const int subgroupSize = limits.subgroup_size().getValue().getSExtValue(); |
| const unsigned loopDepth = partitionedLoops.back() + 1; |
| |
| // Configurations we need to decide. |
| std::array<int64_t, 3> workgroupSize; |
| SmallVector<int64_t> workgroupTileSizes; |
| SmallVector<int64_t> threadTileSizes; |
| |
| // Initialize the configuration. |
| auto initConfiguration = [&]() { |
| workgroupSize = {subgroupSize, 1, 1}; |
| workgroupTileSizes.resize(loopDepth, 0); |
| threadTileSizes.resize(loopDepth, 0); |
| |
| // Initialize tiling along all partitioned loops with size 1. |
| for (int64_t loopIndex : partitionedLoops) { |
| workgroupTileSizes[loopIndex] = threadTileSizes[loopIndex] = 1; |
| } |
| // Override the innermost dimension to distribute to threads in a subgroup. |
| workgroupTileSizes.back() = subgroupSize; |
| threadTileSizes.back() = 1; |
| }; |
| |
| // Special case for non-linalg ops. |
| auto linalgOp = dyn_cast<linalg::LinalgOp>(op); |
| if (!linalgOp || linalgOp.getNumOutputs() != 1) { |
| auto pipeline = |
| IREE::Codegen::DispatchLoweringPassPipeline::SPIRVDistribute; |
| |
| initConfiguration(); |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(threadTileSizes); |
| |
| return setOpConfigAndEntryPointFnTranslation(funcOp, op, tileSizes, |
| pipeline, workgroupSize); |
| } |
| |
| // Common case for all linalg ops. |
| |
| // The core idea is to distribute the partitioned loops to the workgroup |
| // dimensions. The goal is to fill up the GPU as much as possible, which means |
| // 1) distributing to as many threads as possible, and 2) avoiding the |
| // assignment of too many threads to handle out-of-bounds elements (which |
| // would leave them idle). |
| |
| // Returns true if the given `operand` has 32-bit element type. |
| auto has32BitElementType = [](Value operand) { |
| auto shapedType = operand.getType().dyn_cast<ShapedType>(); |
| Type elementType = |
| (shapedType ? shapedType.getElementType() : operand.getType()); |
| return elementType.isa<FloatType>() || elementType.isInteger(32); |
| }; |
| |
| // Whether we can try to use the vectorization pipeline. |
| Optional<SmallVector<int64_t, 4>> loopBounds = linalgOp.getStaticLoopRanges(); |
| bool vectorizable = |
| // The vectorization pipeline assumes tensor semantics when tiling. |
| !linalgOp.hasBufferSemantics() && !linalgOp.hasIndexSemantics() && |
| // Skip vectorization for non-minor identity inputs as it generates |
| // vector.transfer_read ops with permutation maps that we currently |
| // cannot lower. |
| // TODO: Remove this restriction once the lowering of the permutation |
| // map is supported in core. |
| llvm::all_of(linalgOp.getIndexingMaps(), |
| [](AffineMap &map) { return map.isMinorIdentity(); }) && |
| // TODO: Lowering of integers other than i32 may require emulation. |
| // This is currently not supported for vector operations. |
| llvm::all_of(linalgOp->getOperands(), has32BitElementType) && |
| loopBounds && llvm::none_of(loopBounds.getValue(), ShapedType::isDynamic); |
| |
| // Distribute the workload to the given `numThreads`, allowing a potential |
| // loss. |
| auto distributeToThreads = [&](int64_t numThreads, |
| Optional<int64_t> lossFactor = llvm::None) { |
| LLVM_DEBUG(llvm::dbgs() << "\nLoss factor: " << lossFactor << "\n"); |
| initConfiguration(); |
| |
| // Scan from the innermost shape dimension and try to deduce the |
| // configuration for the corresponding GPU workgroup dimension. |
| int64_t wgDim = 0; |
| for (auto shapeDim : llvm::reverse(partitionedLoops)) { |
| // Skip untiled or dynamic dimensions. |
| // TODO: Skip size-1 dimensions in Flow level tiling and distribution. |
| if (loopBounds.getValue()[shapeDim] <= 0) continue; |
| |
| // Try to find some power of two that can divide the current shape dimension |
| // size. This vector keeps the candidate tile sizes. |
| SmallVector<int64_t, 8> candidates; |
| |
| // For the innermost workgroup dimension, try to see if we can have 4 |
| // elements per thread. This enables vectorization. |
| if (vectorizable && wgDim == 0 && !lossFactor) { |
| candidates.push_back(4 * numThreads); |
| } |
| // Try all powers of two up to the given number of threads. |
| for (unsigned i = numThreads; i >= 1; i >>= 1) { |
| candidates.push_back(i); |
| } |
| LLVM_DEBUG({ |
| llvm::dbgs() << "Candidates tile sizes: ["; |
| llvm::interleaveComma(candidates, llvm::dbgs()); |
| llvm::dbgs() << "]\n"; |
| }); |
| |
| for (int64_t candidate : candidates) { |
| if (loopBounds.getValue()[shapeDim] % candidate != 0) { |
| if (!lossFactor) continue; |
| // Skip this candidate if it causes many threads to be idle. |
| int64_t idleThreads = |
| candidate - (loopBounds.getValue()[shapeDim] % candidate); |
| if (idleThreads > candidate / *lossFactor) continue; |
| } |
| LLVM_DEBUG(llvm::dbgs() << "Chosen Candiate " << candidate << "\n"); |
| |
| // Found a suitable candidate. Try to let each thread handle 4 |
| // elements if this is the workgroup x dimension. |
| workgroupTileSizes[shapeDim] = candidate; |
| if (vectorizable && wgDim == 0 && !lossFactor && candidate % 4 == 0) { |
| threadTileSizes[shapeDim] = 4; |
| workgroupSize[wgDim] = candidate / 4; |
| assert(numThreads % (candidate / 4) == 0); |
| numThreads /= candidate / 4; |
| } else { |
| if (wgDim == 0) vectorizable = false; |
| threadTileSizes[shapeDim] = 1; |
| workgroupSize[wgDim] = candidate; |
| assert(numThreads % candidate == 0); |
| numThreads /= candidate; |
| } |
| assert(numThreads >= 1); |
| break; |
| } |
| |
| // Stop if we have distributed all threads. |
| if (numThreads == 1) break; |
| wgDim++; |
| } |
| return numThreads; |
| }; |
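| // Illustrative example (one possible outcome): for a vectorizable elementwise |
| // op with static loop bounds (128, 64) and subgroupSize = 32, |
| // distributeToThreads picks a tile size of 32 for the innermost dimension |
| // with 4 elements per thread (8 threads along X) and a tile size of 4 for the |
| // outer dimension (4 threads along Y), yielding workgroup size (8, 4, 1), |
| // workgroup tile sizes (4, 32), and thread tile sizes (1, 4), with no idle |
| // threads. |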
| |
| // First try to see if we can use up all threads without any loss. |
| if (distributeToThreads(subgroupSize) != 1) { |
| // Otherwise, progressively tolerate more idle threads by decreasing the loss |
| // factor. |
| |
| // Threads for distribution. Use at least 32. |
| int64_t numThreads = std::max(subgroupSize, 32); |
| // We can tolerate up to (1 / lossFactor) of the threads in the workgroup |
| // being idle. |
| int64_t lossFactor = 32; |
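| // For illustration: with lossFactor == 4, a candidate tile size of 32 against |
| // a loop bound of 60 leaves 4 threads idle (60 % 32 == 28), which is within |
| // the allowed 32 / 4 == 8 and therefore accepted. |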
| |
| for (; lossFactor >= 1; lossFactor >>= 1) { |
| if (distributeToThreads(numThreads, lossFactor) == 1) break; |
| } |
| } |
| |
| auto pipeline = |
| vectorizable |
| ? IREE::Codegen::DispatchLoweringPassPipeline::SPIRVVectorize |
| : IREE::Codegen::DispatchLoweringPassPipeline::SPIRVDistribute; |
| |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(threadTileSizes); |
| |
| return setOpConfigAndEntryPointFnTranslation(funcOp, op, tileSizes, pipeline, |
| workgroupSize); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Configuration Dispatcher |
| //===----------------------------------------------------------------------===// |
| |
| /// Sets the CodeGen configuration as attributes on the given `rootOp` if it |
| /// is a known Linalg matmul/convolution op for which we have good |
| /// configurations. |
| static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv, |
| Operation *rootOp) { |
| LogicalResult result = success(); |
| // First try to find a proper CodeGen configuration to tile and vectorize for |
| // the current target architecture. |
| switch (targetEnv.getVendorID()) { |
| case spirv::Vendor::AMD: |
| result = detail::setAMDCodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::ARM: |
| result = detail::setMaliCodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::NVIDIA: |
| result = detail::setNVIDIACodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::Qualcomm: |
| result = detail::setAdrenoCodeGenConfig(targetEnv, rootOp); |
| break; |
| default: |
| break; |
| } |
| |
| if (failed(result)) return result; |
| // Check whether a configuration was actually found. If so, we are done. |
| if (getLoweringConfig(rootOp)) return result; |
| |
| // Otherwise fall back to a default configuration that tiles and |
| // distributes/vectorizes. |
| spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits(); |
| return TypeSwitch<Operation *, LogicalResult>(rootOp) |
| .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([limits](auto op) { |
| // Try to tile and vectorize first. It's common to see 32 threads |
| // per subgroup for GPUs. |
| std::array<int64_t, 2> workgroupXY = {32, 2}; |
| std::array<int64_t, 3> threadMNK = {8, 8, 4}; |
| auto result = detail::setMatmulOpConfig(op, /*subgroupSize=*/32, |
| workgroupXY, threadMNK); |
| if (failed(result)) return result; |
| if (getLoweringConfig(op)) return result; |
| |
| // If unsuccessful, try to tile and distribute. |
| return setDefaultOpConfig(limits, op); |
| }) |
| .Case<linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwcHwcOp>( |
| [limits](auto op) { |
| // Try to tile and vectorize first. It's common to see 32 threads |
| // per subgroup for GPUs. |
| auto result = detail::setConvOpConfig(op, /*subgroupSize=*/32, |
| /*bestTilingFactor=*/32); |
| if (failed(result)) return result; |
| if (getLoweringConfig(op)) return result; |
| |
| // If unsuccessful, try to tile and distribute. |
| return setDefaultOpConfig(limits, op); |
| }) |
| .Case<IREE::LinalgExt::FftOp>([limits](IREE::LinalgExt::FftOp op) { |
| return setFftOpConfig(limits, op); |
| }) |
| .Case<linalg::GenericOp>([limits](linalg::GenericOp op) { |
| // If a generic op has reduction iterator types, it can be treated as a |
| // root op for configuration as well. Use the default configuration, |
| // which will mark it as a root. |
| if (op.getNumLoops() != op.getNumParallelLoops()) { |
| return setDefaultOpConfig(limits, op); |
| } |
| return success(); |
| }) |
| .Default([](Operation *) { return success(); }); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Entry Point |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult initSPIRVLaunchConfig(ModuleOp module) { |
| llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPointOps = |
| getAllEntryPoints(module); |
| spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(module); |
| if (!targetEnvAttr) { |
| return module.emitOpError( |
| "expected parent hal.executable.variant to have spv.target_env " |
| "attribute"); |
| } |
| spirv::TargetEnv targetEnv(targetEnvAttr); |
| spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits(); |
| |
| for (auto funcOp : module.getOps<func::FuncOp>()) { |
| auto entryPointOp = entryPointOps.lookup(funcOp.getName()); |
| if (!entryPointOp) continue; |
| |
| SmallVector<Operation *> computeOps; |
| SmallVector<LoopTilingAndDistributionInfo> tiledLoops; |
| if (failed(getComputeOps(funcOp, computeOps, tiledLoops))) { |
| return funcOp.emitOpError("failed to get compute ops"); |
| } |
| |
| if (computeOps.empty()) { |
| return funcOp.emitOpError( |
| "unhandled translation of function without compute ops"); |
| } |
| |
| Operation *rootOperation = nullptr; |
| // Try to find an op with a known good configuration (e.g., matmul or |
| // convolution) and use it as the root op. |
| for (Operation *computeOp : computeOps) { |
| if (failed(setSPIRVOpConfig(targetEnv, computeOp))) return failure(); |
| |
| // Check if the op configuration was set. |
| if (!getLoweringConfig(computeOp)) continue; |
| |
| if (rootOperation) { |
| return computeOp->emitOpError( |
| "unhandled multiple roots in dispatch region"); |
| } |
| rootOperation = computeOp; |
| } |
| |
| if (!rootOperation) { |
| // If there is still no root op, set a default configuration on the last |
| // compute op and treat it as the root. |
| Operation *computeOp = computeOps.back(); |
| if (failed(setDefaultOpConfig(limits, computeOp))) return failure(); |
| |
| // Check if the op configuration was set. |
| if (!getLoweringConfig(computeOp)) { |
| return computeOp->emitOpError( |
| "without known roots, the last compute operation in the tiled " |
| "loop body is expected to be set as root"); |
| } |
| rootOperation = computeOp; |
| } |
| |
| // Propagate the `lowering_config` attribute to the other ops. |
| // TODO(ravishankarm, antiagainst): This is a very specific use case (and |
| // fragile). In general it should not be needed: things are already tiled and |
| // distributed, and the rest of the compilation should either use |
| // `TileAndFuse` or derive independent configurations based on each op. |
| IREE::Codegen::LoweringConfigAttr config = getLoweringConfig(rootOperation); |
| for (auto op : computeOps) { |
| if (op == rootOperation) continue; |
| setLoweringConfig(op, config); |
| } |
| } |
| return success(); |
| } |
| |
| } // namespace iree_compiler |
| } // namespace mlir |