| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| //===- ConvertToGPUPass.cpp -----------------------------------------------===// |
| // |
| // Partition computation within a dispatch function to workgroups/workitems. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include <array> |
| |
| #include "iree/compiler/Conversion/LinalgToSPIRV/Attributes.h" |
| #include "iree/compiler/Conversion/LinalgToSPIRV/MarkerUtils.h" |
| #include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h" |
| #include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h" |
| #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" |
| #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| #include "mlir/Dialect/GPU/GPUDialect.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/Dialect/SCF/SCF.h" |
| #include "mlir/Dialect/SPIRV/TargetAndABI.h" |
| #include "mlir/Dialect/StandardOps/IR/Ops.h" |
| #include "mlir/IR/AffineMap.h" |
| #include "mlir/IR/FunctionSupport.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| #include "mlir/Transforms/LoopUtils.h" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| static llvm::cl::opt<bool> isWorkgroupCountConstrained( |
| "iree-codegen-constrained-workgroup-count", |
| llvm::cl::desc("Specify whether the number of workgroups can be assumed to " |
| "be large enough to cover the entire workload"), |
| llvm::cl::init(false)); |
| |
| // TODO(#2134): Remove this flag (or always set it to false) when the issue |
| // with convolution is resolved (see bug for more details). |
| // TODO(#2346): Make this a pass specific option. |
| llvm::cl::opt<bool> useLegacyConvLowering{ |
| "iree-codegen-use-legacy-conv-lowering", |
| llvm::cl::desc("Use conv lowering that does not assume 1:1 mapping " |
| "between threads within a block and iterations of " |
| "parallel loops distributed to the block"), |
| llvm::cl::init(true)}; |
| |
| //===----------------------------------------------------------------------===// |
| // Loop utilities |
| //===----------------------------------------------------------------------===// |
| |
| /// Builds an empty scf.for operation. The default builder adds an entry basic |
| /// block, which needs to be avoided here. |
| static scf::ForOp buildEmptyForOp(Location loc, OpBuilder &builder, Value lb, |
| Value ub, Value step) { |
| OperationState state(loc, scf::ForOp::getOperationName()); |
| state.addOperands({lb, ub, step}); |
| state.addRegion(); |
| return cast<scf::ForOp>(builder.createOperation(state)); |
| } |
| |
| /// Builds an empty scf.if operation without the then and else blocks. |
| static scf::IfOp buildEmptyIfOp(Location loc, OpBuilder &builder, Value cond) { |
| OperationState state(loc, scf::IfOp::getOperationName()); |
| state.addOperands(cond); |
| state.addRegion(); |
| state.addRegion(); |
| return cast<scf::IfOp>(builder.createOperation(state)); |
| } |
| |
| namespace { |
| struct LoopBounds { |
| Value lb; |
| Value ub; |
| Value step; |
| }; |
| } // namespace |
| |
| /// Replaces a scf.parallel op with an optional outer scf.parallel op and |
| /// nested scf.for operations. To create the scf.parallel op as the outermost |
| /// loop, pass the lower bounds, upper bounds, and steps of the outer loops in |
| /// `newPLoopBounds`. The bounds of the inner scf.for operations to be created |
| /// are passed in `forBounds`. The `permutation` vector contains a mapping |
| /// from the original loop order to the loop order to be generated. |
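| /// |
| /// As a rough sketch (SSA names are invented for illustration), given |
| /// |
| ///   scf.parallel (%i, %j) = (%lb0, %lb1) to (%ub0, %ub1) |
| ///       step (%s0, %s1) { ... } |
| /// |
| /// with the bounds of loop 1 in `newPLoopBounds`, the bounds of loop 0 in |
| /// `forBounds`, and `permutation` = [1, 0], this produces approximately |
| /// |
| ///   scf.parallel (%j) = (%lb1) to (%ub1) step (%s1) { |
| ///     scf.for %i = %lb0 to %ub0 step %s0 { ... } |
| ///   } |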
| static Operation *replacePLoopOp(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| ArrayRef<LoopBounds> newPLoopBounds, |
| ArrayRef<LoopBounds> forBounds, |
| ArrayRef<unsigned> permutation) { |
| assert(!forBounds.empty() && "unhandled case of no scf.for created"); |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| Location loc = pLoopOp.getLoc(); |
| assert(forBounds.size() + newPLoopBounds.size() == numLoops && |
| "cannot drop loops when splitting scf.parallel operation"); |
| assert(permutation.size() == numLoops); |
| OpBuilder::InsertionGuard guard(rewriter); |
| |
|   // Need a signature conversion for the body of the scf.parallel operation, |
|   // before it can be used as the body of the innermost loop created here. |
| TypeConverter::SignatureConversion signatureConverter(numLoops); |
| Operation *outermostLoop = nullptr; |
| auto permuteIt = permutation.begin(); |
| |
| // Create the scf.parallel operation as the outermost loop, if specified. |
| if (!newPLoopBounds.empty()) { |
| auto lbs = llvm::to_vector<2>(llvm::map_range( |
| newPLoopBounds, [](LoopBounds bounds) -> Value { return bounds.lb; })); |
| auto ubs = llvm::to_vector<2>(llvm::map_range( |
| newPLoopBounds, [](LoopBounds bounds) { return bounds.ub; })); |
| auto steps = llvm::to_vector<2>(llvm::map_range( |
| newPLoopBounds, [](LoopBounds bounds) { return bounds.step; })); |
| auto newPLoop = rewriter.create<scf::ParallelOp>(loc, lbs, ubs, steps); |
| for (auto iv : newPLoop.getInductionVars()) { |
| signatureConverter.remapInput(*permuteIt, iv); |
| permuteIt++; |
| } |
| rewriter.setInsertionPointToStart(newPLoop.getBody()); |
| outermostLoop = newPLoop.getOperation(); |
| } |
| |
| // Generate the nested scf.for operations with the bounds passed. |
| for (auto it : enumerate(forBounds)) { |
| Value lb = it.value().lb, ub = it.value().ub, step = it.value().step; |
| if (it.index() != forBounds.size() - 1) { |
| auto forOp = rewriter.create<scf::ForOp>(loc, lb, ub, step); |
| if (!outermostLoop) outermostLoop = forOp.getOperation(); |
| signatureConverter.remapInput(*permuteIt, forOp.getInductionVar()); |
| rewriter.setInsertionPointToStart(forOp.getBody()); |
| } else { |
|       // For the last loop, move the body of the scf.parallel op to become |
|       // the body of the loop after signature conversion. |
| auto forOp = buildEmptyForOp(loc, rewriter, lb, ub, step); |
| if (!outermostLoop) outermostLoop = forOp.getOperation(); |
| signatureConverter.addInputs(*permuteIt, rewriter.getIndexType()); |
| Region &pLoopOpRegion = pLoopOp.getLoopBody(); |
| rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter); |
| Region &forOpRegion = forOp.getLoopBody(); |
| rewriter.inlineRegionBefore(pLoopOpRegion, forOpRegion, |
| forOpRegion.begin()); |
| } |
| permuteIt++; |
| } |
| rewriter.eraseOp(pLoopOp); |
| return outermostLoop; |
| } |
| |
| /// Serializes the dimensions of the scf.parallel specified in |
| /// `serializedDimensions`, by creating a nested scf.for operation for each |
| /// dimension. |
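| /// |
| /// As a rough sketch, serializing dimension 1 of |
| /// |
| ///   scf.parallel (%i, %j) = (%lb0, %lb1) to (%ub0, %ub1) |
| ///       step (%s0, %s1) { ... } |
| /// |
| /// yields approximately |
| /// |
| ///   scf.parallel (%i) = (%lb0) to (%ub0) step (%s0) { |
| ///     scf.for %j = %lb1 to %ub1 step %s1 { ... } |
| ///   } |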
| // TODO(ravishankarm): Move this into LoopUtils.h in MLIR. |
| static Operation *serializeDimensions(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| ArrayRef<unsigned> serializedDimensions) { |
| assert(!serializedDimensions.empty() && |
| "unhandled corner case of no serializing dims"); |
| OpBuilder::InsertionGuard guard(rewriter); |
| DenseSet<unsigned> serializedDimSet; |
| serializedDimSet.insert(serializedDimensions.begin(), |
| serializedDimensions.end()); |
| assert(serializedDimSet.size() == serializedDimensions.size() && |
| "cannot repeat dimensions during serialization of scf.parallel"); |
| SmallVector<LoopBounds, 2> newPLoopBounds, forBounds; |
| SmallVector<unsigned, 2> permutation; |
| auto lbs = pLoopOp.lowerBound(); |
| auto ubs = pLoopOp.upperBound(); |
| auto steps = pLoopOp.step(); |
| for (unsigned i : llvm::seq<unsigned>(0, pLoopOp.getNumLoops())) { |
| if (serializedDimSet.count(i)) { |
| forBounds.push_back({lbs[i], ubs[i], steps[i]}); |
| } else { |
| newPLoopBounds.push_back({lbs[i], ubs[i], steps[i]}); |
| permutation.push_back(i); |
| } |
| } |
| permutation.append(serializedDimensions.begin(), serializedDimensions.end()); |
| return replacePLoopOp(rewriter, pLoopOp, newPLoopBounds, forBounds, |
| permutation); |
| } |
| |
| /// Serializes all inner dimensions of a `pLoopOp` starting from |
| /// `serializeFrom`. |
| static Operation *serializeDimensionsFrom(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| unsigned serializeFrom) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
|   assert(serializeFrom > 0 && "unhandled case of serializing all dimensions"); |
| assert(serializeFrom < numLoops && |
| "unhandled corner case of no serialization"); |
| SmallVector<unsigned, 2> serializedDimensions; |
| for (unsigned dim : llvm::seq(serializeFrom, numLoops)) |
| serializedDimensions.push_back(dim); |
| return serializeDimensions(rewriter, pLoopOp, serializedDimensions); |
| } |
| |
| /// Collapses all loops in a scf.parallel into one scf.parallel operation. This |
| /// is done by |
| /// 1) Normalizing the loop bounds to be [0, (ub - lb) / step), |
| /// 2) Computing the total number of iterations, and |
| /// 3) Computing the values of the original induction variables from the |
| ///    induction variable of the collapsed loop by de-linearization. |
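| /// |
| /// As a rough sketch (constants shown pre-folded for readability), a loop |
| /// |
| ///   scf.parallel (%i, %j) = (%c0, %c0) to (%c4, %c8) step (%c1, %c1) |
| /// |
| /// collapses to approximately |
| /// |
| ///   scf.parallel (%iv) = (%c0) to (%c32) step (%c1) { |
| ///     %i = divi_signed %iv, %c8 : index |
| ///     %j = remi_signed %iv, %c8 : index |
| ///     ... |
| ///   } |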
| scf::ParallelOp collapseParallelLoops(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp) { |
| if (pLoopOp.getNumReductions()) return nullptr; |
| |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| if (numLoops == 1) return pLoopOp; |
| |
|   // Compute the number of iterations of each loop starting from the innermost. |
| Location loc = pLoopOp.getLoc(); |
| Value totalNumIterations = rewriter.create<ConstantIndexOp>(loc, 1); |
| |
| // Track the "stride" of each loop, i.e. product of the total number of |
| // iterations of the inner loops. |
| SmallVector<Value, 2> iterationStride; |
| iterationStride.resize(pLoopOp.getNumLoops()); |
| auto lbs = pLoopOp.lowerBound(); |
| auto ubs = pLoopOp.upperBound(); |
| auto steps = pLoopOp.step(); |
| for (int i = numLoops - 1; i >= 0; --i) { |
| Value lb = lbs[i], ub = ubs[i], step = steps[i]; |
| Value iterCount = rewriter.create<SignedDivIOp>( |
| loc, rewriter.create<SubIOp>(loc, ub, lb), step); |
| iterationStride[i] = totalNumIterations; |
| totalNumIterations = |
| rewriter.create<MulIOp>(loc, totalNumIterations, iterCount); |
| } |
| |
|   // Create the collapsed parallel loop op with lower bound 0, step 1, and |
|   // upper bound totalNumIterations. |
| Value newLb = rewriter.create<ConstantIndexOp>(loc, 0); |
| Value newStep = rewriter.create<ConstantIndexOp>(loc, 1); |
| scf::ParallelOp newPLoopOp = |
| rewriter.create<scf::ParallelOp>(loc, newLb, totalNumIterations, newStep); |
| |
|   // Build the body of the collapsed loop by cloning the original loop body. |
|   // The replacement values for the induction variables of the original loop |
|   // body are computed from the induction variable of the new loop using |
|   //   origLoopIv[i] = loopIv / iterationStride[i] |
|   //   loopIv = loopIv % iterationStride[i] |
| OpBuilder::InsertionGuard guard(rewriter); |
| Block &pLoopBody = pLoopOp.getLoopBody().front(); |
| rewriter.setInsertionPointToStart(&newPLoopOp.getLoopBody().front()); |
| Value loopIv = *newPLoopOp.getInductionVars().begin(); |
| BlockAndValueMapping map; |
| for (int i : llvm::seq<int>(0, numLoops)) { |
| Value iterNum = |
| rewriter.create<SignedDivIOp>(loc, loopIv, iterationStride[i]); |
| Value newIv = rewriter.create<AddIOp>( |
| loc, lbs[i], rewriter.create<MulIOp>(loc, iterNum, steps[i])); |
| map.map(pLoopBody.getArgument(i), newIv); |
| loopIv = rewriter.create<SignedRemIOp>(loc, loopIv, iterationStride[i]); |
| } |
| for (Operation &op : pLoopBody.without_terminator()) { |
| rewriter.clone(op, map); |
| } |
| rewriter.eraseOp(pLoopOp); |
| return newPLoopOp; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU processor ID mapping utilities |
| //===----------------------------------------------------------------------===// |
| |
| /// Distributes scf.parallel to processors with the processors logically |
| /// arranged with the same dimensionality as the number of loops, i.e. a |
| /// scf.parallel with 2 loops is mapped to a 2D grid of processors. |
| /// `processorIDs` and `numProcessors` must be of the same size as the number |
| /// of loops and are the values to use for the processor ID and the number of |
| /// processors along each dimension in the distributed code. |
| /// This method accounts for the case where the number of processors is not |
| /// enough to execute the entire iteration space with one iteration mapped to |
| /// each processor, so it implements a cyclic distribution of iterations to |
| /// processors. |
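| /// |
| /// As a rough sketch for one dimension (names invented), an original loop |
| /// with bounds (%lb, %ub, %s) turns into |
| /// |
| ///   %off     = muli %s, %procId : index |
| ///   %newLb   = addi %lb, %off : index |
| ///   %newStep = muli %s, %nprocs : index |
| ///   scf.for %iv = %newLb to %ub step %newStep { ... } |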
| static LogicalResult distributeCyclicallyToProcessors( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp, |
| ArrayRef<Value> processorIDs, ArrayRef<Value> numProcessors) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| assert(numLoops == processorIDs.size() && |
| "expected as many ids as number of loops"); |
| assert(numLoops == numProcessors.size() && |
| "expected as many nprocs as number of loops"); |
| SmallVector<LoopBounds, 2> forBounds; |
| SmallVector<unsigned, 2> permutation; |
| forBounds.reserve(numLoops); |
| permutation.reserve(numLoops); |
| Location loc = pLoopOp.getLoc(); |
| auto lbs = pLoopOp.lowerBound(), ubs = pLoopOp.upperBound(), |
| steps = pLoopOp.step(); |
| for (unsigned i : llvm::seq<unsigned>(0, processorIDs.size())) { |
| Value mappedLb = rewriter.create<AddIOp>( |
| loc, lbs[i], rewriter.create<MulIOp>(loc, steps[i], processorIDs[i])); |
| Value mappedStep = rewriter.create<MulIOp>(loc, steps[i], numProcessors[i]); |
| forBounds.push_back({mappedLb, ubs[i], mappedStep}); |
| permutation.push_back(i); |
| } |
| replacePLoopOp(rewriter, pLoopOp, /*newPLoopBounds=*/{}, forBounds, |
| permutation); |
| return success(); |
| } |
| |
| /// Distributes scf.parallel to processors with the processors logically |
| /// arranged with the same dimensionality as the number of loops, i.e. a |
| /// scf.parallel with 2 loops is mapped to a 2D grid of processors. |
| /// `processorIDs` must be of the same size as the number of loops and |
| /// contains the processor ID to use along each dimension in the distributed |
| /// code. This method assumes that the number of processors is greater than |
| /// or equal to the number of iterations, so it just generates an if statement |
| /// to mask off processors with no work. When the number of processors is |
| /// known to be exactly equal to the number of iterations, the if statement is |
| /// not needed; in such cases, `generateGuard` can be set to `false` to avoid |
| /// generating it. |
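| /// |
| /// As a rough sketch for one dimension with `generateGuard` set (names |
| /// invented), the loop body ends up guarded as |
| /// |
| ///   %t  = muli %procId, %s : index |
| ///   %iv = addi %lb, %t : index |
| ///   %inBounds = cmpi "slt", %iv, %ub : index |
| ///   scf.if %inBounds { |
| ///     ... // original body, with %iv replacing the induction variable |
| ///   } |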
| static LogicalResult distributeSingleIterationPerProcessor( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp, |
| ArrayRef<Value> processorIDs, bool generateGuard = false) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| Location loc = pLoopOp.getLoc(); |
| assert(numLoops == processorIDs.size() && |
| "expected as many ids as number of loops"); |
| |
| auto lbs = pLoopOp.lowerBound(); |
| auto step = pLoopOp.step(); |
| SmallVector<Value, 2> ivReplacements; |
| for (unsigned i : llvm::seq<unsigned>(0, numLoops)) { |
| Value iterValue = rewriter.create<AddIOp>( |
| loc, lbs[i], rewriter.create<MulIOp>(loc, processorIDs[i], step[i])); |
| ivReplacements.push_back(iterValue); |
| } |
| Region &pLoopOpRegion = pLoopOp.getLoopBody(); |
| |
| if (generateGuard) { |
| TypeConverter::SignatureConversion signatureConverter(numLoops); |
| Value cond = nullptr; |
| auto ubs = pLoopOp.upperBound(); |
| for (unsigned i : llvm::seq<unsigned>(0, numLoops)) { |
| Value cmp = rewriter.create<CmpIOp>(loc, CmpIPredicate::slt, |
| ivReplacements[i], ubs[i]); |
| cond = (cond ? rewriter.create<AndOp>(loc, cond, cmp) : cmp); |
| signatureConverter.remapInput(i, ivReplacements[i]); |
| } |
| rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter); |
| scf::IfOp ifOp = buildEmptyIfOp(loc, rewriter, cond); |
| Region &ifOpRegion = ifOp.getRegion(0); |
| rewriter.inlineRegionBefore(pLoopOpRegion, ifOpRegion, ifOpRegion.begin()); |
| } else { |
|     // The body of the scf.parallel needs to be moved into its parent |
|     // operation: |
|     // - Split the parent block just before the scf.parallel operation. |
|     // - Erase the terminator of the scf.parallel body and merge the body |
|     //   into the parent block, replacing the induction variables with the |
|     //   computed iteration values. |
|     // - Merge the block created by the split back in after the inlined body. |
| Block *destBlock = pLoopOp.getOperation()->getBlock(); |
| Block *remainingInst = |
| rewriter.splitBlock(destBlock, Block::iterator(pLoopOp)); |
| Block *sourceBlock = &pLoopOpRegion.front(); |
| rewriter.eraseOp(sourceBlock->getTerminator()); |
| rewriter.mergeBlocks(&pLoopOpRegion.front(), destBlock, ivReplacements); |
| rewriter.mergeBlocks(remainingInst, destBlock, {}); |
| } |
| rewriter.eraseOp(pLoopOp); |
| return success(); |
| } |
| |
| namespace { |
| struct ProcessorIdAndCount { |
| Value id; |
| Value count; |
| }; |
| |
| /// These are class declarations that are only used for template |
| /// specialization. They won't be needed if the GPU dialect gets ops for the |
| /// global invocation ID directly. |
| class GPUGlobalId; |
| class GPUGlobalCount; |
| } // namespace |
| |
| template <typename GPUIdOp, typename GPUCountOp> |
| static ProcessorIdAndCount getGPUProcessorIdAndCount( |
| Location loc, StringRef dim, ConversionPatternRewriter &rewriter) { |
| Type indexType = rewriter.getIndexType(); |
| return { |
| rewriter.create<GPUIdOp>(loc, indexType, rewriter.getStringAttr(dim)), |
| rewriter.create<GPUCountOp>(loc, indexType, rewriter.getStringAttr(dim))}; |
| } |
| |
| template <> |
| ProcessorIdAndCount getGPUProcessorIdAndCount<GPUGlobalId, GPUGlobalCount>( |
| Location loc, StringRef dim, ConversionPatternRewriter &rewriter) { |
| Type indexType = rewriter.getIndexType(); |
| Value gridDim = rewriter.create<gpu::GridDimOp>(loc, indexType, |
| rewriter.getStringAttr(dim)); |
| Value blockId = rewriter.create<gpu::BlockIdOp>(loc, indexType, |
| rewriter.getStringAttr(dim)); |
| Value blockDim = rewriter.create<gpu::BlockDimOp>( |
| loc, indexType, rewriter.getStringAttr(dim)); |
| Value threadId = rewriter.create<gpu::ThreadIdOp>( |
| loc, indexType, rewriter.getStringAttr(dim)); |
| return {rewriter.create<AddIOp>( |
| loc, rewriter.create<MulIOp>(loc, blockId, blockDim), threadId), |
| rewriter.create<MulIOp>(loc, blockDim, gridDim)}; |
| } |
| |
| template <typename GPUIdOp, typename GPUCountOp> |
| static void getGPUProcessorIdsAndCounts(Location loc, |
| ConversionPatternRewriter &rewriter, |
| unsigned numDims, |
| MutableArrayRef<Value> id, |
| MutableArrayRef<Value> count) { |
| std::array<StringRef, 3> dims{"x", "y", "z"}; |
| assert(id.size() == numDims); |
| assert(count.size() == numDims); |
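|   // Fill `id` and `count` back-to-front so that the GPU "x" dimension (the |
|   // fastest varying one) maps to the innermost loop. |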
| for (unsigned i = 0; i < numDims; ++i) { |
| ProcessorIdAndCount idAndCount = |
| getGPUProcessorIdAndCount<GPUIdOp, GPUCountOp>(loc, dims[i], rewriter); |
| id[numDims - 1 - i] = idAndCount.id; |
| count[numDims - 1 - i] = idAndCount.count; |
| } |
| } |
| |
| /// Distributes scf.parallel to processors where `GPUIdOp` is used to get the |
| /// processor ID and `GPUCountOp` is used to get the number of processors |
| /// along a dimension. |
| template <typename GPUIdOp, typename GPUCountOp> |
| static LogicalResult distributeCyclicallyToProcessors( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| if (numLoops > 3) { |
| pLoopOp = |
| cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3)); |
| numLoops = 3; |
| } |
| SmallVector<Value, 2> id(numLoops), count(numLoops); |
| getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(pLoopOp.getLoc(), rewriter, |
| numLoops, id, count); |
| return distributeCyclicallyToProcessors(rewriter, pLoopOp, id, count); |
| } |
| |
| /// Distributes scf.parallel to processors where `GPUIdOp` is used to get the |
| /// processor ID and `GPUCountOp` is used to get the number of processors |
| /// along a dimension. Assumes that the number of processors is greater than |
| /// or equal to the number of iterations of the pLoopOp along all dimensions. |
| template <typename GPUIdOp, typename GPUCountOp> |
| static LogicalResult distributeSingleIterationPerProcessor( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp, |
| bool generateGuard = true) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| if (numLoops > 3) { |
| pLoopOp = |
| cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3)); |
| numLoops = 3; |
| } |
| SmallVector<Value, 2> id(numLoops), count(numLoops); |
| getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(pLoopOp.getLoc(), rewriter, |
| numLoops, id, count); |
| return distributeSingleIterationPerProcessor(rewriter, pLoopOp, id, |
| generateGuard); |
| } |
| |
| /// Distributes the scf.parallel to workgroups. |
| static LogicalResult mapToWorkgroups(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| bool useCyclicDistribution = false) { |
| if (useCyclicDistribution) |
| return distributeCyclicallyToProcessors<gpu::BlockIdOp, gpu::GridDimOp>( |
| rewriter, pLoopOp); |
| return distributeSingleIterationPerProcessor<gpu::BlockIdOp, gpu::GridDimOp>( |
| rewriter, pLoopOp, false); |
| } |
| |
| /// Distributes scf.parallel to workitems using local invocation ID. |
| static LogicalResult mapToLocalInvocationId(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp) { |
| return distributeSingleIterationPerProcessor<gpu::ThreadIdOp, |
| gpu::BlockDimOp>(rewriter, |
| pLoopOp); |
| } |
| |
| /// Distributes scf.parallel to workitems using the global invocation ID. The |
| /// GPU dialect doesn't have a direct operation to do this. This could be done |
| /// using id = blockIdx * blockDim + threadIdx and count = blockDim * gridDim. |
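| /// |
| /// As a rough sketch for the "x" dimension (SSA names invented; the actual |
| /// ops are gpu.block_id, gpu.block_dim, gpu.thread_id and gpu.grid_dim), the |
| /// emitted computation is |
| /// |
| ///   %tmp = muli %blockIdX, %blockDimX : index |
| ///   %id  = addi %tmp, %threadIdX : index |
| ///   %cnt = muli %blockDimX, %gridDimX : index |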
| static LogicalResult mapToGlobalInvocationId( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) { |
| return distributeSingleIterationPerProcessor<GPUGlobalId, GPUGlobalCount>( |
| rewriter, pLoopOp); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Pass and patterns. |
| //===----------------------------------------------------------------------===// |
| |
| /// In some cases the iterations of the loops, when partitioned to workgroups, |
| /// need to be distributed in a cyclic manner. The main use case here is when |
| /// the number of workgroups is constrained such that the number of iterations |
| /// is greater than or equal to the number of processors (along any |
| /// dimension). In those cases, distribute the iterations in a cyclic manner. |
| /// This adds additional control flow, but isn't too detrimental to |
| /// performance since the branches are convergent for the most part. |
| // TODO(#2134): Mapping iterations to processors directly by assuming number of |
| // iterations <= number of processors again seems to have an issue with |
| // convolution/pooling. Needs further investigation. |
| static bool useCyclicLoopDistribution(scf::ParallelOp pLoopOp) { |
| if (!useLegacyConvLowering) return false; |
| auto walkResult = pLoopOp.walk([](Operation *op) -> WalkResult { |
| if (isa<linalg::ConvOp>(op) || isa<linalg::PoolingMaxOp>(op) || |
| isa<linalg::PoolingMinOp>(op) || isa<linalg::PoolingSumOp>(op)) |
| return WalkResult::interrupt(); |
| return WalkResult::advance(); |
| }); |
| return walkResult.wasInterrupted(); |
| } |
| |
| namespace { |
| /// Pass to convert tiled and fused linalg ops into gpu.func. |
| struct ConvertToGPUPass : public PassWrapper<ConvertToGPUPass, FunctionPass> { |
| ConvertToGPUPass() = default; |
| ConvertToGPUPass(const ConvertToGPUPass &pass) {} |
| |
| void runOnFunction() override; |
| }; |
| |
| /// Pattern to map scf.parallel to workgroups. |
| struct PartitionPLoopToWorkgroups |
| : public OpConversionPattern<scf::ParallelOp> { |
| using OpConversionPattern<scf::ParallelOp>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| scf::ParallelOp pLoopOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| return mapToWorkgroups( |
| rewriter, pLoopOp, |
| isWorkgroupCountConstrained || useCyclicLoopDistribution(pLoopOp)); |
| } |
| }; |
| |
| /// Maps a tiled linalg op to workitems by lowering it to scf.parallel and |
| /// then partitioning the parallel loops across the workitems. |
| template <typename LinalgOpTy> |
| struct MapLinalgOpToLocalInvocationId : public OpConversionPattern<LinalgOpTy> { |
| using OpConversionPattern<LinalgOpTy>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| LinalgOpTy linalgOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| // Check for marker that specifies that the linalg op is to be partitioned |
| // across threads within a workgroup. |
| if (!hasWorkGroupMarker(linalgOp)) return failure(); |
| Optional<linalg::LinalgLoops> loops = |
| linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp); |
| if (!loops) return failure(); |
| if (!loops.getValue().empty()) { |
| scf::ParallelOp pLoopOp = dyn_cast<scf::ParallelOp>(loops.getValue()[0]); |
| if (!pLoopOp || failed(mapToLocalInvocationId(rewriter, pLoopOp))) |
| return failure(); |
| } |
| rewriter.eraseOp(linalgOp); |
| return success(); |
| } |
| }; |
| |
| /// Legacy path for lowering tiled conv/pooling ops to loops. |
| // TODO(#2134): Remove this pattern. The default path of using |
| // `MapLinalgOpToLocalInvocationId` seems to have a bug. It only shows up |
| // currently on Resnet50. Remove this pattern after the bug is triaged/fixed. |
| template <typename LinalgOpTy> |
| struct MapConvPoolToLocalInvocationId : public OpConversionPattern<LinalgOpTy> { |
| using OpConversionPattern<LinalgOpTy>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| LinalgOpTy linalgOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| if (!hasWorkGroupMarker(linalgOp)) return failure(); |
| Optional<linalg::LinalgLoops> loops = |
| linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp); |
| if (!loops) return failure(); |
| scf::ParallelOp pLoopOp = cast<scf::ParallelOp>(loops.getValue()[0]); |
| if (failed( |
| distributeCyclicallyToProcessors<gpu::ThreadIdOp, gpu::BlockDimOp>( |
| rewriter, pLoopOp))) |
| return failure(); |
| rewriter.eraseOp(linalgOp); |
| return success(); |
| } |
| }; |
| |
| /// Maps a linalg operation to execute in parallel on the GPU by mapping its |
| /// parallel loops to the "GlobalInvocationId". |
| template <typename LinalgOpTy> |
| struct MapLinalgOpToGlobalInvocationId |
| : public OpConversionPattern<LinalgOpTy> { |
| using OpConversionPattern<LinalgOpTy>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| LinalgOpTy linalgOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
|     // If a marker already exists, do nothing. |
| if (hasMarker(linalgOp)) return failure(); |
| FuncOp funcOp = linalgOp.template getParentOfType<FuncOp>(); |
| if (!funcOp) return failure(); |
| Optional<linalg::LinalgLoops> loops = |
| linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp); |
| if (!loops) return failure(); |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); |
| if (!loops.getValue().empty()) { |
| scf::ParallelOp pLoopOp = dyn_cast<scf::ParallelOp>(loops.getValue()[0]); |
|       // If there are parallel loops, partition them to threads using the |
|       // global invocation ID. |
| if (pLoopOp) { |
| pLoopOp = collapseParallelLoops(rewriter, pLoopOp); |
| if (!pLoopOp) return failure(); |
| if (failed(mapToGlobalInvocationId(rewriter, pLoopOp))) |
| return rewriter.notifyMatchFailure( |
| linalgOp, "mapping to GlobalInvocationID failed"); |
| workgroupSize = {32, 1, 1}; |
| } |
| } |
| rewriter.eraseOp(linalgOp); |
| if (failed(updateWorkGroupSize(funcOp, workgroupSize))) return failure(); |
| funcOp.setAttr(getWorkgroupCountAttrName(), |
| rewriter.getI32IntegerAttr(static_cast<int32_t>( |
| WorkgroupCountMethodology::LinearizeResultShape))); |
| return success(); |
| } |
| }; |
| |
| /// Removes linalg.range operations created when lowering to loops. |
| struct RemoveLinalgRange : public OpConversionPattern<linalg::RangeOp> { |
| using OpConversionPattern<linalg::RangeOp>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| linalg::RangeOp rangeOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| if (!rangeOp.getResult().use_empty()) return failure(); |
| rewriter.eraseOp(rangeOp); |
| return success(); |
| } |
| }; |
| } // namespace |
| |
| void populateParallelLoopToWorkgroupPatterns( |
| MLIRContext *context, OwningRewritePatternList &patterns) { |
| patterns.insert<PartitionPLoopToWorkgroups>(context); |
| } |
| |
| void ConvertToGPUPass::runOnFunction() { |
| FuncOp funcOp = getFunction(); |
| |
| Region &body = funcOp.getBody(); |
| if (!llvm::hasSingleElement(body)) { |
| funcOp.emitError("unhandled dispatch function with multiple blocks"); |
| return signalPassFailure(); |
| } |
| |
| MLIRContext *context = &getContext(); |
| ConversionTarget target(*context); |
|   // After this pass, Linalg and scf.parallel ops should be gone. |
| target.addIllegalOp<scf::ParallelOp>(); |
| target.addIllegalDialect<linalg::LinalgDialect>(); |
|   // Reshape ops are treated as legal since they just change the way the |
|   // underlying buffer is viewed. These are legalized downstream. They become |
|   // no-ops when lowering to SPIR-V since the SPIR-V code uses linearized |
|   // arrays. |
| target.addLegalOp<linalg::ReshapeOp>(); |
| // Let the rest fall through. |
| target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); |
| |
| OwningRewritePatternList patterns; |
| |
| // clang-format off |
| patterns.insert< |
| |
| #define ADD_ALL_LINALG_PATTERNS(OP_NAME) \ |
| MapLinalgOpToGlobalInvocationId<OP_NAME>, \ |
| MapLinalgOpToLocalInvocationId<OP_NAME> |
| |
| ADD_ALL_LINALG_PATTERNS(linalg::CopyOp), |
| ADD_ALL_LINALG_PATTERNS(linalg::FillOp), |
| ADD_ALL_LINALG_PATTERNS(linalg::GenericOp), |
| ADD_ALL_LINALG_PATTERNS(linalg::IndexedGenericOp), |
| |
| #undef ADD_ALL_LINALG_PATTERNS |
| |
| #define ADD_ALL_CONV_POOL_PATTERNS(OP_NAME) \ |
| MapConvPoolToLocalInvocationId<OP_NAME>, \ |
| MapLinalgOpToGlobalInvocationId<OP_NAME> |
| |
| ADD_ALL_CONV_POOL_PATTERNS(linalg::PoolingMaxOp), |
| ADD_ALL_CONV_POOL_PATTERNS(linalg::PoolingMinOp), |
| ADD_ALL_CONV_POOL_PATTERNS(linalg::PoolingSumOp), |
| |
| #undef ADD_ALL_CONV_POOL_PATTERNS |
| |
| MapLinalgOpToLocalInvocationId<linalg::MatmulOp>, |
| PartitionPLoopToWorkgroups, RemoveLinalgRange>(context); |
| // clang-format on |
| |
| patterns.insert<MapLinalgOpToGlobalInvocationId<linalg::ConvOp>>(context); |
| if (useLegacyConvLowering) |
| patterns.insert<MapConvPoolToLocalInvocationId<linalg::ConvOp>>(context); |
| else |
| patterns.insert<MapLinalgOpToLocalInvocationId<linalg::ConvOp>>(context); |
| |
| if (failed(applyFullConversion(funcOp, target, patterns))) |
| return signalPassFailure(); |
| } |
| |
| std::unique_ptr<OperationPass<FuncOp>> createConvertToGPUPass() { |
| return std::make_unique<ConvertToGPUPass>(); |
| } |
| |
| static PassRegistration<ConvertToGPUPass> pass( |
| "iree-codegen-convert-to-gpu", "Map tiled linalg and loop ops to GPU", |
| [] { return std::make_unique<ConvertToGPUPass>(); }); |
| |
| } // namespace iree_compiler |
| } // namespace mlir |