| // Copyright 2020 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| //===- ConvertToGPUPass.cpp -----------------------------------------------===// |
| // |
| // Partition computation within dispatch function to workgroups/workitems. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include <array> |
| #include <numeric> |
| |
| #include "iree/compiler/Conversion/CodegenUtils/FunctionUtils.h" |
| #include "iree/compiler/Conversion/CodegenUtils/MarkerUtils.h" |
| #include "iree/compiler/Conversion/Common/Attributes.h" |
| #include "iree/compiler/Conversion/Common/Transforms.h" |
| #include "iree/compiler/Conversion/LinalgToSPIRV/KernelDispatchUtils.h" |
| #include "iree/compiler/Conversion/LinalgToSPIRV/MemorySpace.h" |
| #include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h" |
| #include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h" |
| #include "iree/compiler/Dialect/HAL/IR/HALOps.h" |
| #include "iree/compiler/Dialect/Shape/IR/ShapeDialect.h" |
| #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" |
| #include "mlir/Dialect/Affine/IR/AffineOps.h" |
| #include "mlir/Dialect/GPU/GPUDialect.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/Dialect/SCF/SCF.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" |
| #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" |
| #include "mlir/Dialect/StandardOps/IR/Ops.h" |
| #include "mlir/IR/AffineMap.h" |
| #include "mlir/IR/FunctionSupport.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Transforms/DialectConversion.h" |
| #include "mlir/Transforms/LoopUtils.h" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| //===----------------------------------------------------------------------===// |
| // Loop utilities |
| //===----------------------------------------------------------------------===// |
| |
| /// Builds an empty scf.for operation. The default builder adds an entry basic |
| /// block which needs to be avoided here. |
| static scf::ForOp buildEmptyForOp(Location loc, OpBuilder &builder, Value lb, |
| Value ub, Value step) { |
| OperationState state(loc, scf::ForOp::getOperationName()); |
| state.addOperands({lb, ub, step}); |
| state.addRegion(); |
| return cast<scf::ForOp>(builder.createOperation(state)); |
| } |
| |
| /// Builds an empty scf.if operation without the then and else blocks. |
| static scf::IfOp buildEmptyIfOp(Location loc, OpBuilder &builder, Value cond) { |
| OperationState state(loc, scf::IfOp::getOperationName()); |
| state.addOperands(cond); |
| state.addRegion(); |
| state.addRegion(); |
| return cast<scf::IfOp>(builder.createOperation(state)); |
| } |
| |
| namespace { |
| struct LoopBounds { |
| Value lb; |
| Value ub; |
| Value step; |
| }; |
| } // namespace |
| |
/// Replaces an scf.parallel op with an optional scf.parallel op and nested
/// scf.for operations. To create the scf.parallel op as the outermost loop,
/// pass its lower bounds, upper bounds, and steps in `newPLoopBounds`. The
/// bounds of the inner scf.for operations to be created are passed in
/// `forBounds`. The `permutation` vector contains a mapping from the original
/// loop order to the loop order to be generated.
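///
/// As an illustrative sketch (with invented SSA names), keeping the first
/// loop of a 2D scf.parallel parallel and serializing the second rewrites
///
///   scf.parallel (%i, %j) = (%lb0, %lb1) to (%ub0, %ub1) step (%s0, %s1)
///
/// into
///
///   scf.parallel (%i) = (%lb0) to (%ub0) step (%s0) {
///     scf.for %j = %lb1 to %ub1 step %s1 { <original body> }
///   }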
| static Operation *replacePLoopOp(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| ArrayRef<LoopBounds> newPLoopBounds, |
| ArrayRef<LoopBounds> forBounds, |
| ArrayRef<unsigned> permutation) { |
| assert(!forBounds.empty() && "unhandled case of no scf.for created"); |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| Location loc = pLoopOp.getLoc(); |
| assert(forBounds.size() + newPLoopBounds.size() == numLoops && |
| "cannot drop loops when splitting scf.parallel operation"); |
| assert(permutation.size() == numLoops); |
| OpBuilder::InsertionGuard guard(rewriter); |
| |
  // Need a signature conversion for the body of the scf.parallel operation,
  // before it can be used as the body of the innermost loop created here.
| TypeConverter::SignatureConversion signatureConverter(numLoops); |
| Operation *outermostLoop = nullptr; |
| auto permuteIt = permutation.begin(); |
| |
| // Create the scf.parallel operation as the outermost loop, if specified. |
| if (!newPLoopBounds.empty()) { |
| auto lbs = llvm::to_vector<2>(llvm::map_range( |
| newPLoopBounds, [](LoopBounds bounds) -> Value { return bounds.lb; })); |
| auto ubs = llvm::to_vector<2>(llvm::map_range( |
| newPLoopBounds, [](LoopBounds bounds) { return bounds.ub; })); |
| auto steps = llvm::to_vector<2>(llvm::map_range( |
| newPLoopBounds, [](LoopBounds bounds) { return bounds.step; })); |
| auto newPLoop = rewriter.create<scf::ParallelOp>(loc, lbs, ubs, steps); |
| for (auto iv : newPLoop.getInductionVars()) { |
| signatureConverter.remapInput(*permuteIt, iv); |
| permuteIt++; |
| } |
| rewriter.setInsertionPointToStart(newPLoop.getBody()); |
| outermostLoop = newPLoop.getOperation(); |
| } |
| |
| // Generate the nested scf.for operations with the bounds passed. |
| for (auto it : enumerate(forBounds)) { |
| Value lb = it.value().lb, ub = it.value().ub, step = it.value().step; |
| if (it.index() != forBounds.size() - 1) { |
| auto forOp = rewriter.create<scf::ForOp>(loc, lb, ub, step); |
| if (!outermostLoop) outermostLoop = forOp.getOperation(); |
| signatureConverter.remapInput(*permuteIt, forOp.getInductionVar()); |
| rewriter.setInsertionPointToStart(forOp.getBody()); |
| } else { |
| // For the last loop, move the body of the scf.parallel op as the body of |
| // the loop after signature conversion. |
| auto forOp = buildEmptyForOp(loc, rewriter, lb, ub, step); |
| if (!outermostLoop) outermostLoop = forOp.getOperation(); |
| signatureConverter.addInputs(*permuteIt, rewriter.getIndexType()); |
| Region &pLoopOpRegion = pLoopOp.getLoopBody(); |
| rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter); |
| Region &forOpRegion = forOp.getLoopBody(); |
| rewriter.inlineRegionBefore(pLoopOpRegion, forOpRegion, |
| forOpRegion.begin()); |
| } |
| permuteIt++; |
| } |
| rewriter.eraseOp(pLoopOp); |
| return outermostLoop; |
| } |
| |
/// Serializes the dimensions of the scf.parallel specified in
/// `serializedDimensions`, by creating a nested scf.for operation for each
/// dimension.
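///
/// For example, serializing dimension 0 of a 2D scf.parallel (a sketch with
/// invented SSA names; the remaining dimension stays as the outer
/// scf.parallel and the serialized dimension becomes an inner scf.for):
///
///   scf.parallel (%i, %j) = (%lb0, %lb1) to (%ub0, %ub1) step (%s0, %s1)
///
/// becomes
///
///   scf.parallel (%j) = (%lb1) to (%ub1) step (%s1) {
///     scf.for %i = %lb0 to %ub0 step %s0 { <original body> }
///   }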
| // TODO(ravishankarm): Move this into LoopUtils.h in MLIR. |
| static Operation *serializeDimensions(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| ArrayRef<unsigned> serializedDimensions) { |
| assert(!serializedDimensions.empty() && |
| "unhandled corner case of no serializing dims"); |
| OpBuilder::InsertionGuard guard(rewriter); |
| DenseSet<unsigned> serializedDimSet; |
| serializedDimSet.insert(serializedDimensions.begin(), |
| serializedDimensions.end()); |
| assert(serializedDimSet.size() == serializedDimensions.size() && |
| "cannot repeat dimensions during serialization of scf.parallel"); |
| SmallVector<LoopBounds, 2> newPLoopBounds, forBounds; |
| SmallVector<unsigned, 2> permutation; |
| auto lbs = pLoopOp.lowerBound(); |
| auto ubs = pLoopOp.upperBound(); |
| auto steps = pLoopOp.step(); |
| for (unsigned i : llvm::seq<unsigned>(0, pLoopOp.getNumLoops())) { |
| if (serializedDimSet.count(i)) { |
| forBounds.push_back({lbs[i], ubs[i], steps[i]}); |
| } else { |
| newPLoopBounds.push_back({lbs[i], ubs[i], steps[i]}); |
| permutation.push_back(i); |
| } |
| } |
| permutation.append(serializedDimensions.begin(), serializedDimensions.end()); |
| return replacePLoopOp(rewriter, pLoopOp, newPLoopBounds, forBounds, |
| permutation); |
| } |
| |
| /// Serialize all inner dimensions of a `pLoopOp` starting from `serializeFrom`. |
| static Operation *serializeDimensionsFrom(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| unsigned serializeFrom) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| assert(serializeFrom < numLoops && |
| "unhandled corner case of no serialization"); |
| SmallVector<unsigned, 2> serializedDimensions; |
| for (unsigned dim : llvm::seq(serializeFrom, numLoops)) |
| serializedDimensions.push_back(dim); |
| return serializeDimensions(rewriter, pLoopOp, serializedDimensions); |
| } |
| |
/// Collapses all loops in a scf.parallel into one scf.parallel operation. This
/// is done by
/// 1) Normalizing the loop bounds to be [0, (ub - lb) / step),
/// 2) Computing the total number of iterations, and
/// 3) Computing the values of the original induction variables from the
///    induction variable of the collapsed loop by de-linearization.
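///
/// For example (a sketch, with constants folded for readability):
///
///   scf.parallel (%i, %j) = (%c0, %c0) to (%c4, %c8) step (%c1, %c1)
///
/// becomes
///
///   scf.parallel (%iv) = (%c0) to (%c32) step (%c1) {
///     %i = divi_signed %iv, %c8
///     %j = remi_signed %iv, %c8
///     <original body>
///   }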
| scf::ParallelOp collapseParallelLoops(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp) { |
| if (pLoopOp.getNumReductions()) return nullptr; |
| |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| if (numLoops == 1) return pLoopOp; |
| |
  // Compute the number of iterations of each loop starting from the innermost.
| Location loc = pLoopOp.getLoc(); |
| Value totalNumIterations = rewriter.create<ConstantIndexOp>(loc, 1); |
| |
| // Track the "stride" of each loop, i.e. product of the total number of |
| // iterations of the inner loops. |
| SmallVector<Value, 2> iterationStride; |
| iterationStride.resize(pLoopOp.getNumLoops()); |
| auto lbs = pLoopOp.lowerBound(); |
| auto ubs = pLoopOp.upperBound(); |
| auto steps = pLoopOp.step(); |
| for (int i = numLoops - 1; i >= 0; --i) { |
| Value lb = lbs[i], ub = ubs[i], step = steps[i]; |
| Value iterCount = rewriter.create<SignedDivIOp>( |
| loc, rewriter.create<SubIOp>(loc, ub, lb), step); |
| iterationStride[i] = totalNumIterations; |
| totalNumIterations = |
| rewriter.create<MulIOp>(loc, totalNumIterations, iterCount); |
| } |
| |
  // Create the collapsed parallel loop op with lower bound 0, step 1, and
  // upper bound `totalNumIterations`.
| Value newLb = rewriter.create<ConstantIndexOp>(loc, 0); |
| Value newStep = rewriter.create<ConstantIndexOp>(loc, 1); |
| scf::ParallelOp newPLoopOp = |
| rewriter.create<scf::ParallelOp>(loc, newLb, totalNumIterations, newStep); |
| |
  // Build the body of the collapsed loop by cloning the original loop body.
  // The replacement values of the original induction variables are computed
  // from the induction variable of the new loop using
  //   origLoopIv[i] = loopIv / iterationStride[i]
  //   loopIv = loopIv % iterationStride[i]
| OpBuilder::InsertionGuard guard(rewriter); |
| Block &pLoopBody = pLoopOp.getLoopBody().front(); |
| rewriter.setInsertionPointToStart(&newPLoopOp.getLoopBody().front()); |
| Value loopIv = *newPLoopOp.getInductionVars().begin(); |
| BlockAndValueMapping map; |
| edsc::ScopedContext scope(rewriter, loc); |
| using namespace edsc::op; |
| for (int i : llvm::seq<int>(0, numLoops)) { |
| Value iterNum = |
| rewriter.create<SignedDivIOp>(loc, loopIv, iterationStride[i]); |
| Value newIv = lbs[i] + (iterNum * steps[i]); |
| map.map(pLoopBody.getArgument(i), newIv); |
| loopIv = rewriter.create<SignedRemIOp>(loc, loopIv, iterationStride[i]); |
| } |
| for (Operation &op : pLoopBody.without_terminator()) { |
| rewriter.clone(op, map); |
| } |
| rewriter.eraseOp(pLoopOp); |
| return newPLoopOp; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // GPU processor ID mapping utilities |
| //===----------------------------------------------------------------------===// |
| |
/// Distributes scf.parallel to processors that are logically arranged with
/// the same dimensionality as the number of loops, i.e. a scf.parallel with 2
/// loops maps to a 2D grid of processors. `procInfo` must have as many
/// entries as the number of loops; it provides the processor ID and the
/// number of processors to use along each dimension in the distributed code.
/// This method accounts for the case where the number of processors is not
/// enough to execute the entire iteration space with one iteration mapped to
/// each processor, so it implements a cyclic distribution of iterations to
/// processors.
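///
/// As a sketch for one dimension, with processor ID %id and processor count
/// %np (names invented):
///
///   scf.parallel (%i) = (%lb) to (%ub) step (%s)
///
/// becomes
///
///   %off = muli %s, %id
///   %newLb = addi %lb, %off
///   %newStep = muli %s, %np
///   scf.for %i = %newLb to %ub step %newStep { <original body> }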
| static LogicalResult distributeCyclicallyToProcessors( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp, |
| ArrayRef<linalg::ProcInfo> procInfo) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| assert(numLoops == procInfo.size() && |
| "expected as many ids as number of loops"); |
| SmallVector<LoopBounds, 2> forBounds; |
| SmallVector<unsigned, 2> permutation; |
| forBounds.reserve(numLoops); |
| permutation.reserve(numLoops); |
| Location loc = pLoopOp.getLoc(); |
| auto lbs = pLoopOp.lowerBound(), ubs = pLoopOp.upperBound(), |
| steps = pLoopOp.step(); |
| for (unsigned i : llvm::seq<unsigned>(0, procInfo.size())) { |
| Value mappedLb = rewriter.create<AddIOp>( |
| loc, lbs[i], |
| rewriter.create<MulIOp>(loc, steps[i], procInfo[i].procId)); |
| Value mappedStep = |
| rewriter.create<MulIOp>(loc, steps[i], procInfo[i].nprocs); |
| forBounds.push_back({mappedLb, ubs[i], mappedStep}); |
| permutation.push_back(i); |
| } |
| replacePLoopOp(rewriter, pLoopOp, /*newPLoopBounds=*/{}, forBounds, |
| permutation); |
| return success(); |
| } |
| |
/// Distributes scf.parallel to processors that are logically arranged with
/// the same dimensionality as the number of loops, i.e. a scf.parallel with 2
/// loops maps to a 2D grid of processors. `procInfo` must have as many
/// entries as the number of loops; it provides the processor ID along each
/// dimension in the distributed code. This method assumes that the number of
/// processors is greater than or equal to the number of iterations, so it
/// just generates an if statement to mask off processors with no work. When
/// the number of processors is known to be exactly equal to the number of
/// iterations, the if statement is not needed either; in such cases
/// `generateGuard` can be set to `false` to avoid generating it.
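///
/// As a sketch for one dimension with `generateGuard` set (pseudo-IR with
/// invented names, processor ID %id):
///
///   scf.parallel (%i) = (%lb) to (%ub) step (%s)
///
/// becomes
///
///   %t = muli %id, %s
///   %iv = addi %lb, %t
///   %cond = cmpi slt, %iv, %ub
///   scf.if %cond { <original body, with %i replaced by %iv> }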
| static LogicalResult distributeSingleIterationPerProcessor( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp, |
| ArrayRef<linalg::ProcInfo> procInfo, bool generateGuard = false) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| Location loc = pLoopOp.getLoc(); |
| assert(numLoops == procInfo.size() && |
| "expected as many ids as number of loops"); |
| |
| auto lbs = pLoopOp.lowerBound(); |
| auto step = pLoopOp.step(); |
| SmallVector<Value, 2> ivReplacements; |
| for (unsigned i : llvm::seq<unsigned>(0, numLoops)) { |
| Value iterValue = rewriter.create<AddIOp>( |
| loc, lbs[i], rewriter.create<MulIOp>(loc, procInfo[i].procId, step[i])); |
| ivReplacements.push_back(iterValue); |
| } |
| Region &pLoopOpRegion = pLoopOp.getLoopBody(); |
| |
| if (generateGuard) { |
| TypeConverter::SignatureConversion signatureConverter(numLoops); |
| Value cond = nullptr; |
| auto ubs = pLoopOp.upperBound(); |
| for (unsigned i : llvm::seq<unsigned>(0, numLoops)) { |
| Value cmp = rewriter.create<CmpIOp>(loc, CmpIPredicate::slt, |
| ivReplacements[i], ubs[i]); |
| cond = (cond ? rewriter.create<AndOp>(loc, cond, cmp) : cmp); |
| signatureConverter.remapInput(i, ivReplacements[i]); |
| } |
| rewriter.applySignatureConversion(&pLoopOpRegion, signatureConverter); |
| scf::IfOp ifOp = buildEmptyIfOp(loc, rewriter, cond); |
| Region &ifOpRegion = ifOp.getRegion(0); |
| rewriter.inlineRegionBefore(pLoopOpRegion, ifOpRegion, ifOpRegion.begin()); |
| } else { |
| // The body of the scf.parallel needs to be moved into its parent |
| // operation. |
| // - Split the block just before the scf.parallel operation. |
| // - Move the only block of scf.parallel before the newly created block |
| // (after signature conversion). |
| // - Add branch from the original block to the moved block of the |
| // scf.parallel's region, and from the latter to the block created by the |
| // split operation. |
| // - Canonicalization will fold these branches away. |
| Block *destBlock = pLoopOp.getOperation()->getBlock(); |
| Block *remainingInst = |
| rewriter.splitBlock(destBlock, Block::iterator(pLoopOp)); |
| Block *sourceBlock = &pLoopOpRegion.front(); |
| rewriter.eraseOp(sourceBlock->getTerminator()); |
    rewriter.mergeBlocks(sourceBlock, destBlock, ivReplacements);
| rewriter.mergeBlocks(remainingInst, destBlock, {}); |
| } |
| rewriter.eraseOp(pLoopOp); |
| return success(); |
| } |
| |
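/// Returns the linearized processor ID and total processor count over the
/// `kNumGPUDims` GPU dimensions, i.e. for per-dimension IDs (id0, id1, id2)
/// and counts (n0, n1, n2) this computes
///   procId = (id0 * n1 + id1) * n2 + id2 and nprocs = n0 * n1 * n2.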
| template <typename GPUIdOp, typename GPUCountOp> |
| static linalg::ProcInfo getLinearizedGPUProcessorIdAndCount( |
| Location loc, ConversionPatternRewriter &rewriter) { |
| SmallVector<linalg::ProcInfo, 3> procInfo = |
| getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>(rewriter, loc, |
| kNumGPUDims); |
| linalg::ProcInfo linearized; |
| linearized.procId = procInfo[0].procId; |
| linearized.nprocs = procInfo[0].nprocs; |
| for (unsigned i = 0; i < kNumGPUDims - 1; ++i) { |
| linearized.procId = |
| rewriter.create<MulIOp>(loc, linearized.procId, procInfo[i + 1].nprocs); |
| linearized.procId = |
| rewriter.create<AddIOp>(loc, linearized.procId, procInfo[i + 1].procId); |
| linearized.nprocs = |
| rewriter.create<MulIOp>(loc, linearized.nprocs, procInfo[i + 1].nprocs); |
| } |
| return linearized; |
| } |
| |
/// Distributes scf.parallel to processors where `GPUIdOp` is used to get the
/// processor ID and `GPUCountOp` is used to get the number of processors along
/// a dimension.
| template <typename GPUIdOp, typename GPUCountOp> |
| static LogicalResult distributeCyclicallyToProcessors( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| if (numLoops > 3) { |
| pLoopOp = |
| cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3)); |
| numLoops = 3; |
| } |
| SmallVector<linalg::ProcInfo, 2> procInfo = |
| getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>( |
| rewriter, pLoopOp.getLoc(), numLoops); |
| return distributeCyclicallyToProcessors(rewriter, pLoopOp, procInfo); |
| } |
| |
/// Distributes scf.parallel to processors where `GPUIdOp` is used to get the
/// processor ID and `GPUCountOp` is used to get the number of processors along
/// a dimension. Assumes that the number of processors will be less than or
/// equal to the number of iterations of the pLoopOp along all dimensions.
| template <typename GPUIdOp, typename GPUCountOp> |
| static LogicalResult distributeSingleIterationPerProcessor( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp, |
| bool generateGuard = true) { |
| unsigned numLoops = pLoopOp.getNumLoops(); |
| if (numLoops > 3) { |
| pLoopOp = |
| cast<scf::ParallelOp>(serializeDimensionsFrom(rewriter, pLoopOp, 3)); |
| numLoops = 3; |
| } |
| auto procInfo = getGPUProcessorIdsAndCounts<GPUIdOp, GPUCountOp>( |
| rewriter, pLoopOp.getLoc(), numLoops); |
| return distributeSingleIterationPerProcessor(rewriter, pLoopOp, procInfo, |
| generateGuard); |
| } |
| |
| /// Distribute the scf.parallel to workgroups. |
| static LogicalResult mapToWorkgroups(ConversionPatternRewriter &rewriter, |
| scf::ParallelOp pLoopOp, |
| bool useCyclicDistribution = false) { |
| if (useCyclicDistribution) { |
| return distributeCyclicallyToProcessors<gpu::BlockIdOp, gpu::GridDimOp>( |
| rewriter, pLoopOp); |
| } |
| return distributeSingleIterationPerProcessor<gpu::BlockIdOp, gpu::GridDimOp>( |
| rewriter, pLoopOp, false); |
| } |
| |
| /// Distributes scf.parallel to workitems using local invocation ID. |
| static LogicalResult mapToLocalInvocationId( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp, |
| bool useCyclicDistribution = false) { |
| if (useCyclicDistribution) { |
| return distributeCyclicallyToProcessors<gpu::ThreadIdOp, gpu::BlockDimOp>( |
| rewriter, pLoopOp); |
| } |
| return distributeSingleIterationPerProcessor<gpu::ThreadIdOp, |
| gpu::BlockDimOp>(rewriter, |
| pLoopOp); |
| } |
| |
/// Distributes scf.parallel to workitems using the global invocation ID. The
/// GPU dialect doesn't have a direct operation to compute this; it is derived
/// as id = blockIdx * blockDim + threadIdx and count = blockDim * gridDim.
| static LogicalResult mapToGlobalInvocationId( |
| ConversionPatternRewriter &rewriter, scf::ParallelOp pLoopOp) { |
| return distributeSingleIterationPerProcessor<GPUGlobalId, GPUGlobalCount>( |
| rewriter, pLoopOp); |
| } |
| |
/// Returns the number of elements copied when loading to/storing from
/// workgroup memory. It is approximated by the number of elements in the
/// underlying allocation being copied into/from.
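/// For example, if the workgroup memory side of the copy is a subview of a
/// statically shaped 16x16 allocation, this returns 256.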
| static Optional<int64_t> getLinearizedCopySize(linalg::CopyOp copyOp) { |
| Value src = copyOp.input(); |
| Value dst = copyOp.output(); |
| MemRefType srcType = src.getType().cast<MemRefType>(); |
| MemRefType dstType = dst.getType().cast<MemRefType>(); |
| |
| Value workgroupMemoryView; |
| MemRefType workgroupMemoryType; |
| if (srcType.getMemorySpace() == getWorkgroupMemorySpace()) { |
| workgroupMemoryView = src; |
| workgroupMemoryType = srcType; |
| } else if (dstType.getMemorySpace() == getWorkgroupMemorySpace()) { |
| workgroupMemoryView = dst; |
| workgroupMemoryType = dstType; |
| } else { |
| return {}; |
| } |
| |
| SubViewOp workgroupMemorySubviewOp = |
| dyn_cast_or_null<SubViewOp>(workgroupMemoryView.getDefiningOp()); |
| if (!workgroupMemorySubviewOp) return {}; |
| AllocOp allocOp = dyn_cast_or_null<AllocOp>( |
| workgroupMemorySubviewOp.source().getDefiningOp()); |
| if (!allocOp) return {}; |
| |
| MemRefType allocOpType = allocOp.getType(); |
| if (!allocOpType.hasStaticShape()) return {}; |
| return allocOpType.getNumElements(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Pass and patterns. |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
/// Pass to partition tiled and fused linalg ops in a dispatch function across
/// GPU workgroups and workitems.
| class ConvertToGPUPass |
| : public PassWrapper<ConvertToGPUPass, |
| OperationPass<IREE::HAL::ExecutableTargetOp>> { |
| public: |
| ConvertToGPUPass(const SPIRVCodegenOptions &passOptions) |
| : options(passOptions) {} |
| ConvertToGPUPass(const ConvertToGPUPass &pass) : options(pass.options) {} |
| |
| void getDependentDialects(DialectRegistry ®istry) const override { |
| registry.insert<AffineDialect, gpu::GPUDialect, scf::SCFDialect, |
| ShapeDialect>(); |
| } |
| void runOnOperation() override; |
| |
| private: |
| SPIRVCodegenOptions options; |
| }; |
| |
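/// Pattern to serialize all dimensions of any remaining scf.parallel op into
/// nested scf.for loops.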
| struct SerializeParallelLoopPattern |
| : public OpConversionPattern<scf::ParallelOp> { |
| using OpConversionPattern<scf::ParallelOp>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| scf::ParallelOp pLoopOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| return success(serializeDimensionsFrom(rewriter, pLoopOp, 0) != nullptr); |
| } |
| }; |
| |
/// Implementation of the mapping of a tiled linalg op to workitems within a
/// workgroup.
| template <typename LinalgOpTy> |
| static LogicalResult mapLinalgOpToLocalInvocationIdImpl( |
| LinalgOpTy linalgOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter, bool optimizeControlFlow) { |
| // Check for marker that specifies that the linalg op is to be partitioned |
| // across threads within a workgroup. |
| if (!hasMarker(linalgOp)) return failure(); |
| Optional<linalg::LinalgLoops> loops = |
| linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp); |
| if (!loops) return failure(); |
| if (loops.getValue().empty()) return success(); |
| |
| auto pLoopOp = cast<scf::ParallelOp>(loops.getValue()[0]); |
| return mapToLocalInvocationId(rewriter, pLoopOp, optimizeControlFlow); |
| } |
| |
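/// Distributes the scf.parallel loop generated from a linalg.copy to
/// workitems: the loops are first collapsed to 1D, then distributed with one
/// iteration per invocation when the linearized copy size fits within the
/// linearized workgroup size, and cyclically otherwise.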
| static LogicalResult distributeCopyOp(linalg::CopyOp copyOp, |
| scf::ParallelOp pLoopOp, |
| ConversionPatternRewriter &rewriter) { |
| pLoopOp = collapseParallelLoops(rewriter, pLoopOp); |
| if (!pLoopOp) return failure(); |
| |
| Optional<int64_t> copyLength = getLinearizedCopySize(copyOp); |
| linalg::ProcInfo idAndCount = |
| getLinearizedGPUProcessorIdAndCount<gpu::ThreadIdOp, gpu::BlockDimOp>( |
| copyOp.getLoc(), rewriter); |
| auto workgroupSize = |
| spirv::lookupLocalWorkGroupSize(copyOp).getValues<APInt>(); |
  int64_t linearizedWorkgroupSize = std::accumulate(
      workgroupSize.begin(), workgroupSize.end(), int64_t(1),
      [](int64_t total, APInt value) { return total * value.getSExtValue(); });
| |
| if (copyLength.hasValue() && !workgroupSize.empty() && |
| copyLength.getValue() <= linearizedWorkgroupSize) { |
| return distributeSingleIterationPerProcessor(rewriter, pLoopOp, idAndCount, |
| /*generateGuard=*/true); |
| } |
| return distributeCyclicallyToProcessors(rewriter, pLoopOp, idAndCount); |
| } |
| |
/// linalg.copy ops that load to/store from workgroup memory are special cased
/// to use all workitems to do the copy. This is done by linearizing the copy
/// operation.
| // TODO(ravishankarm): This linearization is achieved through collapsing the |
| // generated parallel loops from a multi-dimensional copy. Such lowering results |
| // in mods/divs in the collapsed loop body. This can be removed by reshaping the |
| // copy to be a 1D copy. This seems to be hitting an error in reshape |
| // canonicalization. Investigate this further. |
| template <> |
| LogicalResult mapLinalgOpToLocalInvocationIdImpl<linalg::CopyOp>( |
| linalg::CopyOp copyOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter, bool optimizeControlFlow) { |
| if (!hasMarker(copyOp, |
| {getCopyToWorkgroupMemoryMarker(), getWorkgroupMarker()})) |
| return failure(); |
| Optional<linalg::LinalgLoops> loops = |
| linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, copyOp); |
| if (!loops) return failure(); |
| if (loops.getValue().empty()) return success(); |
| |
| auto pLoopOp = cast<scf::ParallelOp>(loops.getValue()[0]); |
| if (hasMarker(copyOp, getWorkgroupMarker())) { |
| return mapToLocalInvocationId(rewriter, pLoopOp, optimizeControlFlow); |
| } |
| return distributeCopyOp(copyOp, pLoopOp, rewriter); |
| } |
| |
| /// Map tiled linalg op to workitems by lowering it to scf.parallel and |
| /// partitioning it to workitems. |
| template <typename LinalgOpTy> |
| struct MapLinalgOpToLocalInvocationId : public OpConversionPattern<LinalgOpTy> { |
| MapLinalgOpToLocalInvocationId(MLIRContext *context, |
| bool usingLinalgOnTensorsPath, |
| PatternBenefit benefit = 1) |
| : OpConversionPattern<LinalgOpTy>(context, benefit), |
| usingLinalgOnTensorsPath(usingLinalgOnTensorsPath) {} |
| LogicalResult matchAndRewrite( |
| LinalgOpTy linalgOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| if (failed(mapLinalgOpToLocalInvocationIdImpl(linalgOp, operands, rewriter, |
| usingLinalgOnTensorsPath))) |
| return failure(); |
| |
    // If the `linalgOp` writes to workgroup memory, insert a barrier after
    // the op.
| if (llvm::any_of(linalgOp.getOperands(), [](Value output) { |
| MemRefType outputType = output.getType().dyn_cast<MemRefType>(); |
| return outputType && |
| outputType.getMemorySpace() == getWorkgroupMemorySpace(); |
| })) { |
| rewriter.create<spirv::ControlBarrierOp>( |
| linalgOp.getLoc(), spirv::Scope::Workgroup, spirv::Scope::Workgroup, |
| spirv::MemorySemantics::AcquireRelease); |
| } |
| rewriter.eraseOp(linalgOp); |
| return success(); |
| } |
| |
| private: |
  /// Flag to signify if the Linalg on tensors path is being used. The control
  /// flow optimizations implemented on the legacy path seem to fail on this
  /// path. Assuming the overhead is not too high, for now just generate the
  /// extra loops.
| bool usingLinalgOnTensorsPath; |
| }; |
| |
/// Given the workload, returns the workgroup count along X obtained by
/// linearizing the workload and dividing by the workgroup size.
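/// For example, for a workload (%d0, %d1) and workgroupSizeX = 32 this builds
/// the affine map (s0 * s1) ceildiv 32 and applies it to (%d0, %d1).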
| static Value getWorkgroupCountX(OpBuilder &builder, Location loc, |
| ArrayRef<Value> values, |
| int64_t workgroupSizeX) { |
| AffineExpr expr = builder.getAffineConstantExpr(1); |
| for (auto val : enumerate(values)) { |
| expr = expr * builder.getAffineSymbolExpr(val.index()); |
| } |
| expr = expr.ceilDiv(workgroupSizeX); |
| return linalg::applyMapToValues( |
| builder, loc, AffineMap::get(0, values.size(), expr), values)[0]; |
| } |
| |
| /// Map linalg operation to execute on GPU in parallel by mapping the parallel |
| /// loops to "GlobalInvocationId". |
| template <typename LinalgOpTy> |
| struct MapLinalgOpToGlobalInvocationId |
| : public OpConversionPattern<LinalgOpTy> { |
| MapLinalgOpToGlobalInvocationId(MLIRContext *context, |
| bool usingLinalgOnTensorsPath, |
| PatternBenefit benefit = 1) |
| : OpConversionPattern<LinalgOpTy>(context, benefit), |
| usingLinalgOnTensorsPath(usingLinalgOnTensorsPath) {} |
| |
| LogicalResult matchAndRewrite( |
| LinalgOpTy linalgOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
    // If a marker exists, do nothing.
| if (hasMarker(linalgOp)) return failure(); |
| FuncOp funcOp = linalgOp->template getParentOfType<FuncOp>(); |
| if (!funcOp) return failure(); |
| Optional<linalg::LinalgLoops> loops = |
| linalg::linalgLowerOpToLoops<scf::ParallelOp>(rewriter, linalgOp); |
| if (!loops) return failure(); |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); |
| if (!loops.getValue().empty()) { |
| scf::ParallelOp pLoopOp = dyn_cast<scf::ParallelOp>(loops.getValue()[0]); |
      // If there are parallel loops, partition them to threads using the
      // global invocation ID.
| if (pLoopOp) { |
| pLoopOp = collapseParallelLoops(rewriter, pLoopOp); |
| if (!pLoopOp) return failure(); |
| if (failed(mapToGlobalInvocationId(rewriter, pLoopOp))) |
| return rewriter.notifyMatchFailure( |
| linalgOp, "mapping to GlobalInvocationID failed"); |
| workgroupSize = {32, 1, 1}; |
| } |
| } |
| if (usingLinalgOnTensorsPath) { |
| WorkgroupCountRegionBuilder regionBuilder = |
| [&workgroupSize]( |
| OpBuilder &b, Location loc, |
| std::array<Value, 3> workload) -> std::array<Value, 3> { |
| Value one = b.create<ConstantIndexOp>(loc, 1); |
| return {getWorkgroupCountX(b, loc, workload, workgroupSize[0]), one, |
| one}; |
| }; |
| if (failed(defineWorkgroupCountRegion(rewriter, funcOp, regionBuilder))) { |
| return failure(); |
| } |
| } else { |
      // TODO(GH-4901): Only static shapes are supported on this path. This
      // should be removed when moving to linalg on tensors.
| Optional<SmallVector<int64_t, 4>> staticLoopRange = |
| linalg::getStaticLoopRanges(linalgOp); |
| if (!staticLoopRange || |
| llvm::any_of(staticLoopRange.getValue(), [](int64_t d) { |
| return d == ShapedType::kDynamicSize; |
| })) { |
        return linalgOp.emitError("failed to find static loop bounds");
| } |
| ArrayRef<int64_t> parallelLoopRange(staticLoopRange.getValue()); |
| unsigned numOuterParallel = getNumOuterParallelLoops(linalgOp); |
| parallelLoopRange = parallelLoopRange.take_front(numOuterParallel); |
| WorkgroupCountRegionBuilder regionBuilder = |
| [¶llelLoopRange, &workgroupSize]( |
| OpBuilder &b, Location loc, |
| std::array<Value, 3> workload) -> std::array<Value, 3> { |
| Value one = b.create<ConstantIndexOp>(loc, 1); |
| auto values = llvm::to_vector<4>( |
| llvm::map_range(parallelLoopRange, [&](int64_t dim) -> Value { |
| return b.create<ConstantIndexOp>(loc, dim); |
| })); |
| return {getWorkgroupCountX(b, loc, values, workgroupSize[0]), one, one}; |
| }; |
| if (failed(defineWorkgroupCountRegion(rewriter, funcOp, regionBuilder))) { |
| return failure(); |
| } |
| } |
| if (failed(updateWorkGroupSize(funcOp, workgroupSize))) { |
| return failure(); |
| } |
| rewriter.eraseOp(linalgOp); |
| return success(); |
| } |
| |
| private: |
  /// Flag to signify if the Linalg on tensors path is being used. This changes
  /// the way the number of workgroups is computed: with the linalg on tensors
  /// path, the hal.executable.entry_point is updated to contain a region that
  /// gives the number of workgroups to use.
| bool usingLinalgOnTensorsPath; |
| }; |
| |
| /// Remove the linalg.range operation created when lowering to loops. |
| struct RemoveLinalgRange : public OpConversionPattern<linalg::RangeOp> { |
| using OpConversionPattern<linalg::RangeOp>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| linalg::RangeOp rangeOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| if (!rangeOp.getResult().use_empty()) return failure(); |
| rewriter.eraseOp(rangeOp); |
| return success(); |
| } |
| }; |
| } // namespace |
| |
// Applies tiling to get load/store accesses of an optimized size, then
// distributes the tiled copy among invocations.
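// For example, for an f32 copy (32-bit elements) the tile sizes below are
// {1, 4}, so each invocation accesses 128 bits per load/store.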
| static LogicalResult linalgCopyTileAndDistribute( |
| linalg::CopyOp copyOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) { |
| linalg::LinalgTilingOptions options; |
  // Tile to memory accesses of 128 bits as those tend to be optimal on most
  // GPUs.
| constexpr unsigned vecLoadBits = 128; |
| unsigned elementBits = |
| copyOp.getSource().getType().cast<MemRefType>().getElementTypeBitWidth(); |
| if (elementBits == 0 || vecLoadBits % elementBits != 0) return failure(); |
| unsigned numElement = vecLoadBits / elementBits; |
| options.setTileSizes({1, numElement}) |
| .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops); |
| Optional<linalg::TiledLinalgOp> tiledOp = linalg::tileLinalgOp( |
| rewriter, cast<linalg::LinalgOp>(copyOp.getOperation()), options); |
| if (!tiledOp) return failure(); |
| if (tiledOp->loops.empty()) return success(); |
| setMarker(tiledOp->op, getVectorizeMarker()); |
| auto pLoopOp = cast<scf::ParallelOp>(tiledOp->loops[0]); |
| return distributeCopyOp(copyOp, pLoopOp, rewriter); |
| } |
| |
| namespace { |
| // Pattern to tile and distribute linalg::CopyOp. |
| struct TileAndDistributeCopyOp : public OpConversionPattern<linalg::CopyOp> { |
| using OpConversionPattern<linalg::CopyOp>::OpConversionPattern; |
| LogicalResult matchAndRewrite( |
| linalg::CopyOp linalgOp, ArrayRef<Value> operands, |
| ConversionPatternRewriter &rewriter) const override { |
| if (!hasMarker(linalgOp, getCopyToWorkgroupMemoryMarker())) { |
| return failure(); |
| } |
| if (failed(linalgCopyTileAndDistribute(linalgOp, operands, rewriter))) { |
| return failure(); |
| } |
| |
    // Insert a barrier if the op reads or writes shared memory.
| if (llvm::any_of(linalgOp.getOperands(), [](Value output) { |
| return output.getType().cast<MemRefType>().getMemorySpace() == |
| getWorkgroupMemorySpace(); |
| })) { |
| rewriter.create<spirv::ControlBarrierOp>( |
| linalgOp.getLoc(), spirv::Scope::Workgroup, spirv::Scope::Workgroup, |
| spirv::MemorySemantics::AcquireRelease); |
| } |
| rewriter.eraseOp(linalgOp); |
| return success(); |
| } |
| }; |
| } // namespace |
| |
| void populateLinalgTileAndDistributePatterns( |
| MLIRContext *context, OwningRewritePatternList &patterns) { |
| patterns.insert<TileAndDistributeCopyOp>(context); |
| } |
| |
| void ConvertToGPUPass::runOnOperation() { |
| MLIRContext *context = &getContext(); |
| ConversionTarget target(*context); |
  // After this pass, Linalg and scf.parallel ops should be gone.
| target.addIllegalOp<scf::ParallelOp>(); |
| target.addIllegalDialect<linalg::LinalgDialect>(); |
  // Reshape ops are treated as legal since they just change the way the
  // underlying buffer is viewed. These are legalized downstream. They become
  // no-ops when lowering to SPIR-V since the SPIR-V code uses linearized
  // arrays.
| target.addLegalOp<linalg::ReshapeOp>(); |
| // Let the rest fall through. |
| target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); |
| |
| OwningRewritePatternList patterns; |
| |
| patterns.insert< |
| MapLinalgOpToGlobalInvocationId<linalg::CopyOp>, |
| MapLinalgOpToGlobalInvocationId<linalg::FillOp>, |
| MapLinalgOpToGlobalInvocationId<linalg::GenericOp>, |
| MapLinalgOpToGlobalInvocationId<linalg::IndexedGenericOp>, |
| MapLinalgOpToLocalInvocationId<linalg::ConvOp>, |
| MapLinalgOpToLocalInvocationId<linalg::ConvInputNWCFilterWCFOp>, |
| MapLinalgOpToLocalInvocationId<linalg::ConvInputNHWCFilterHWCFOp>, |
| MapLinalgOpToLocalInvocationId<linalg::ConvInputNDHWCFilterDHWCFOp>, |
| MapLinalgOpToLocalInvocationId<linalg::CopyOp>, |
| MapLinalgOpToLocalInvocationId<linalg::FillOp>, |
| MapLinalgOpToLocalInvocationId<linalg::GenericOp>, |
| MapLinalgOpToLocalInvocationId<linalg::IndexedGenericOp>, |
| MapLinalgOpToLocalInvocationId<linalg::MatmulOp>, |
| MapLinalgOpToLocalInvocationId<linalg::BatchMatmulOp>, |
| MapLinalgOpToLocalInvocationId<linalg::PoolingMaxOp>, |
| MapLinalgOpToLocalInvocationId<linalg::PoolingMinOp>, |
| MapLinalgOpToLocalInvocationId<linalg::PoolingSumOp>, RemoveLinalgRange, |
| SerializeParallelLoopPattern>(context, options.usingLinalgOnTensors); |
| FrozenRewritePatternList frozenPatterns(std::move(patterns)); |
| |
| for (FuncOp funcOp : getOperation().getInnerModule().getOps<FuncOp>()) { |
| if (!isEntryPoint(funcOp)) continue; |
| Region &body = funcOp.getBody(); |
| if (!llvm::hasSingleElement(body)) { |
| funcOp.emitError("unhandled dispatch function with multiple blocks"); |
| return signalPassFailure(); |
| } |
| if (failed(applyFullConversion(funcOp, target, frozenPatterns))) |
| return signalPassFailure(); |
| } |
| } |
| |
| std::unique_ptr<OperationPass<IREE::HAL::ExecutableTargetOp>> |
| createConvertToGPUPass(const SPIRVCodegenOptions &options) { |
| return std::make_unique<ConvertToGPUPass>(options); |
| } |
| |
| static PassRegistration<ConvertToGPUPass> pass( |
| "iree-codegen-convert-to-gpu", "Map tiled linalg and loop ops to GPU", [] { |
| SPIRVCodegenOptions options = getSPIRVCodegenOptionsFromClOptions(); |
| return std::make_unique<ConvertToGPUPass>(options); |
| }); |
| |
| } // namespace iree_compiler |
| } // namespace mlir |