// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include <functional>
#include <numeric>
#include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "iree/compiler/Codegen/Common/UserConfig.h"
#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
#include "iree/compiler/Codegen/SPIRV/Utils.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Matchers.h"
#define DEBUG_TYPE "iree-spirv-kernel-config"
using llvm::APIntOps::GreatestCommonDivisor;
// The default number of subgroups to use per workgroup.
constexpr unsigned numSubgroupsPerWorkgroup = 4;
// The default number of tiles along each dimension to use per subgroup.
constexpr unsigned numTilesPerSubgroupDim = 2;
constexpr int kMaxVectorNumBits = 128;
namespace mlir {
namespace iree_compiler {
using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline;
//===----------------------------------------------------------------------===//
// Utility Functions
//===----------------------------------------------------------------------===//
bool isMatmulOrBatchMatmul(linalg::LinalgOp linalgOp) {
return linalg::isaContractionOpInterface(linalgOp) &&
llvm::is_contained({2u, 3u}, linalgOp.getNumParallelLoops());
}
//===----------------------------------------------------------------------===//
// Convolution Default Configuration
//===----------------------------------------------------------------------===//
/// Decides the tiling and distribution parameters for one convolution
/// dimension. Returns true if we can successfully deduce.
///
/// - `inputDim` is the size of the dimension to be distributed.
/// - `residualThreads` is the remaining threads we can distribute.
/// - `residualTilingFactor` indicates the remaining tiling scale factor.
/// - `wgDimSize` will be updated with the decided workgroup dimension size.
/// - `wgTileSize` will be updated with the decided workgroup tile size.
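///
/// For example (illustrative), with `inputDim = 112`, `isInnerMostDim = true`,
/// `residualThreads = 8`, and `residualTilingFactor = 32`: dim = 8 needs
/// 8 * 4 = 32 to divide 112 (it does not), so we halve to dim = 4, where
/// 4 * 4 = 16 divides 112. The result is `wgDimSize = 4`, `wgTileSize = 16`,
/// `residualThreads = 2`, and `residualTilingFactor = 8`.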
static bool tileConvOneDim(const int64_t inputDim, const bool isInnerMostDim,
int64_t &residualThreads,
int64_t &residualTilingFactor, int64_t &wgDimSize,
int64_t &wgTileSize) {
const int64_t lb = isInnerMostDim ? 2 : 1;
for (int64_t dim = residualThreads; dim >= lb; dim >>= 1) {
int64_t chosenTileSize = 0;
if (isInnerMostDim) {
// Handle 4 elements per thread for the innermost dimension. We need
// this for vectorized load.
chosenTileSize = 4;
if (inputDim % (dim * chosenTileSize) != 0) continue;
} else {
for (int64_t t = residualTilingFactor; t >= 1; t >>= 1)
if (inputDim % (dim * t) == 0) {
chosenTileSize = t;
break;
}
}
if (chosenTileSize) {
wgDimSize = dim;
wgTileSize = dim * chosenTileSize;
residualThreads /= dim;
residualTilingFactor /= chosenTileSize;
return true;
}
}
return false;
}
/// Decides the tiling and distribution parameters for two convolution window
/// dimensions to two workgroup dimensions as a square. Returns true if we can
/// successfully deduce.
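///
/// For example (illustrative), with `oh = ow = 56`, `residualThreads = 4`, and
/// `residualTilingFactor = 8`: both workgroup dimensions get
/// 2^(log2(4) / 2) = 2 threads and a tile scale of 2^(log2(8) / 2) = 2; since
/// 56 is divisible by 2 * 2 = 4, the deduced `wgDimSizes` are {2, 2} and
/// `wgTileSizes` are {4, 4}.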
static bool tileConvSquare(const int64_t oh, const int64_t ow,
int64_t &residualThreads,
int64_t &residualTilingFactor,
MutableArrayRef<int64_t> wgDimSizes,
MutableArrayRef<int64_t> wgTileSizes) {
assert(wgDimSizes.size() == 2 && wgTileSizes.size() == 2);
const unsigned log2Threads = llvm::Log2_64(residualThreads);
if (oh == ow && residualThreads != 1 && log2Threads % 2 == 0) {
const int64_t yz = 1ll << (log2Threads / 2);
int64_t chosenTileSize = 1ll << (llvm::Log2_64(residualTilingFactor) / 2);
while (chosenTileSize >= 1 && ow % (yz * chosenTileSize) != 0) {
chosenTileSize >>= 1;
}
if (chosenTileSize != 0) {
wgDimSizes.front() = wgDimSizes.back() = yz;
wgTileSizes.front() = wgTileSizes.back() = yz * chosenTileSize;
return true;
}
}
return false;
}
namespace detail {
LogicalResult setConvOpConfig(linalg::LinalgOp linalgOp,
const int64_t subgroupSize,
const int64_t bestTilingFactor) {
LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as convolution...\n");
Type inputType = linalgOp.getDpsInputOperand(0)->get().getType();
ArrayRef<int64_t> inputShape = inputType.cast<ShapedType>().getShape();
Type outputType = linalgOp.getDpsInitOperand(0)->get().getType();
ArrayRef<int64_t> outputShape = outputType.cast<ShapedType>().getShape();
const bool isNCHW = isa<linalg::Conv2DNchwFchwOp>(*linalgOp);
const bool isNHWC = isa<linalg::Conv2DNhwcHwcfOp>(*linalgOp) ||
isa<linalg::DepthwiseConv2DNhwcHwcOp>(*linalgOp);
if (!isNCHW && !isNHWC) return success();
const int icIndex = isNHWC ? 3 : 1;
const int ohIndex = isNHWC ? 1 : 2;
const int owIndex = isNHWC ? 2 : 3;
const int ocIndex = isNHWC ? 3 : 1;
if (ShapedType::isDynamic(inputShape[icIndex]) ||
llvm::any_of(outputShape.drop_front(), ShapedType::isDynamic)) {
return success();
}
const int64_t ic = inputShape[icIndex], oc = outputShape[ocIndex];
const int64_t oh = outputShape[ohIndex], ow = outputShape[owIndex];
// The conversion pipeline requires the input channel dimension to be a
// multiple of four, or less than four.
if (!(ic % 4 == 0 || ic < 4)) return success();
// The core idea is to distribute the convolution dimensions to the workgroup
// Z/Y/X dimensions, with each thread in a workgroup handling multiple vector
// elements. We try to 1) utilize all threads in a subgroup, and 2) handle an
// optimal tile size along each dimension.
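// As an illustration (assuming subgroupSize = 32 and bestTilingFactor = 32,
// the values used by the dispatcher below): an NHWC convolution with output
// shape 1x56x56x64 gets workgroup size 16 and tile size 64 along OC,
// workgroup size 2 and tile size 8 along OW, and workgroup size 1 and tile
// size 2 along OH, i.e., workgroup size (16, 2, 1) with per-thread tile sizes
// of 4, 4, and 2 elements along OC, OW, and OH, respectively.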
int64_t residualThreads = subgroupSize;
int64_t residualTilingFactor = bestTilingFactor;
SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z)
SmallVector<int64_t> workgroupTileSizes(4, 0);
if (isNCHW) {
// OW -> x, OH -> y, OC -> z
if (!tileConvOneDim(ow, /*isInnerMostDim=*/true, residualThreads,
residualTilingFactor, workgroupSize[0],
workgroupTileSizes[3]) ||
!tileConvOneDim(oh, /*isInnerMostDim=*/false, residualThreads,
residualTilingFactor, workgroupSize[1],
workgroupTileSizes[2]) ||
!tileConvOneDim(oc, /*isInnerMostDim=*/false, residualThreads,
residualTilingFactor, workgroupSize[2],
workgroupTileSizes[1])) {
return success();
}
} else {
// OC -> x
if (!tileConvOneDim(oc, /*isInnerMostDim=*/true, residualThreads,
residualTilingFactor, workgroupSize[0],
workgroupTileSizes[3]))
return success();
// Deduce the configuration for the OW and OH dimensions. Try to distribute
// them evenly if possible, given that we typically have images with the same
// height and width.
const bool tileToSquare = tileConvSquare(
oh, ow, residualThreads, residualTilingFactor,
llvm::makeMutableArrayRef(workgroupSize).drop_front(),
llvm::makeMutableArrayRef(workgroupTileSizes).drop_front().drop_back());
// Otherwise treat OW and OH separately to allow them to have different
// number of threads and tiling size.
if (!tileToSquare) {
if (!tileConvOneDim(ow, /*isInnerMostDim=*/false, residualThreads,
residualTilingFactor, workgroupSize[1],
workgroupTileSizes[2]) ||
!tileConvOneDim(oh, /*isInnerMostDim=*/false, residualThreads,
residualTilingFactor, workgroupSize[2],
workgroupTileSizes[1])) {
return success();
}
}
}
SmallVector<int64_t> threadTileSizes(4, 0);
for (int i = 1; i <= 3; ++i) {
threadTileSizes[i] = workgroupTileSizes[i] / workgroupSize[3 - i];
}
auto pipeline = CodeGenPipeline::SPIRVBaseVectorize;
TileSizesListType tileSizes;
tileSizes.push_back(workgroupTileSizes);
tileSizes.push_back(threadTileSizes);
// Tiling along reduction dimensions
if (isa<linalg::Conv2DNchwFchwOp>(linalgOp)) {
tileSizes.push_back({0, 0, 0, 0, 4, 1, 1}); // (N, OC, OH, OW, IC, FH, FW)
} else if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) {
tileSizes.push_back({0, 0, 0, 0, 1, 1, 4}); // (N, OH, OW, OC, FH, FW, IC)
} else if (isa<linalg::DepthwiseConv2DNhwcHwcOp>(linalgOp)) {
tileSizes.push_back({0, 0, 0, 0, 1, 1}); // (N, OH, OW, C, FH, FW)
} else {
return success();
}
// Tile along OH by size 1 to enable downsizing 2-D convolution to 1-D.
SmallVector<int64_t> windowTileSizes(4, 0);
windowTileSizes[ohIndex] = 1;
tileSizes.push_back(windowTileSizes);
auto funcOp = linalgOp->getParentOfType<func::FuncOp>();
return setOpConfigAndEntryPointFnTranslation(funcOp, linalgOp, tileSizes,
pipeline, workgroupSize);
}
} // namespace detail
//===----------------------------------------------------------------------===//
// Matmul Default Configuration
//===----------------------------------------------------------------------===//
/// Given the linalg `op`, tries to treat it as a (batch) matmul-like op and
/// deduce the index of the loop corresponding to the B/M/N/K dimension,
/// respectively. Returns -1 as the index if unable to deduce.
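///
/// For example, for a linalg.batch_matmul with iteration space
/// (B, M, N, K) = (d0, d1, d2, d3), the LHS map accesses (d0, d1, d3) and the
/// RHS map accesses (d0, d3, d2), so this returns
/// (bIndex, mIndex, nIndex, kIndex) = (0, 1, 2, 3).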
std::tuple<int, int, int, int> getMatmulBMNKIndex(linalg::LinalgOp op,
int *lastParallelDim) {
OpOperand *lhs = op.getDpsInputOperand(0);
OpOperand *rhs = op.getDpsInputOperand(1);
auto lhsShape = lhs->get().getType().cast<ShapedType>().getShape();
auto rhsShape = rhs->get().getType().cast<ShapedType>().getShape();
auto lhsLoopIndices = llvm::to_vector(llvm::map_range(
llvm::seq<int>(0, lhsShape.size()),
[&](int i) { return op.getMatchingIndexingMap(lhs).getDimPosition(i); }));
auto rhsLoopIndices = llvm::to_vector(llvm::map_range(
llvm::seq<int>(0, rhsShape.size()),
[&](int i) { return op.getMatchingIndexingMap(rhs).getDimPosition(i); }));
// Figure out what dimension each loop corresponds to.
int bIndex = -1, mIndex = -1, nIndex = -1, kIndex = -1;
for (unsigned i = 0; i < op.getNumLoops(); ++i) {
if (linalg::isReductionIterator(op.getIteratorTypesArray()[i])) {
kIndex = i;
continue;
}
const bool inLHS = llvm::is_contained(lhsLoopIndices, i);
const bool inRHS = llvm::is_contained(rhsLoopIndices, i);
if (inLHS && inRHS) {
bIndex = i;
} else if (inLHS) {
// For cases where we have two parallel dimensions only accessed by
// the LHS, treat the outer one of them as the batch dimension.
if (mIndex >= 0 && bIndex < 0) bIndex = mIndex;
mIndex = i;
} else if (inRHS) {
// For cases where we have two parallel dimensions only accessed by
// the RHS, treat the outer one of them as the batch dimension.
if (nIndex >= 0 && bIndex < 0) bIndex = nIndex;
nIndex = i;
}
if (lastParallelDim) *lastParallelDim = i;
}
LLVM_DEBUG({
llvm::dbgs() << "(B, M, N, K) indices = (" << bIndex << ", " << mIndex
<< ", " << nIndex << ", " << kIndex << ")\n";
});
return {bIndex, mIndex, nIndex, kIndex};
}
/// Decides the tiling and distribution parameters for matmul's N dimension to
/// workgroup X dimension.
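///
/// For example (illustrative), with `dimN = 1024`, `bestThreadN = 8`, and
/// `bestX = 32`: 32 * 8 = 256 divides 1024, so `wgDimSize = 32` and
/// `wgTileSize = 256`.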
static bool tileMatmulNToWorkgroupX(const int64_t dimN,
const int64_t bestThreadN,
int64_t &residualThreads,
const int64_t bestX,
int64_t &residualTilingFactor,
int64_t &wgDimSize, int64_t &wgTileSize) {
// Deduce the configuration for the N dimension. Start with the best workgroup
// X size, and reduce by a factor of two each time.
for (int64_t x = bestX; x >= 2; x >>= 1) {
// Handle `bestThreadN` elements per thread for the innermost dimension. We
// need this for vectorized load.
int64_t chosenTileSize = bestThreadN;
if (dimN % (x * chosenTileSize) == 0) {
wgDimSize = x;
wgTileSize = x * chosenTileSize;
residualThreads /= x;
assert(residualTilingFactor % chosenTileSize == 0);
residualTilingFactor /= chosenTileSize;
return true;
}
}
return false;
}
/// Decides the tiling and distribution parameters for matmul's M dimension to
/// workgroup Y dimension.
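///
/// For example (illustrative), with `dimM = 100`, `bestThreadM = 8`, and
/// `residualThreads = 2`: the thread tile size is reduced from 8 down to 5,
/// where 2 * 5 = 10 divides 100, giving `wgDimSize = 2` and `wgTileSize = 10`.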
static bool tileMatmulMToWorkgroupY(const int64_t dimM,
const int64_t bestThreadM,
int64_t &residualThreads,
const int64_t bestY,
int64_t &residualTilingFactor,
int64_t &wgDimSize, int64_t &wgTileSize) {
// Deduce the configuration for the M dimension. Start with the residual
// thread count, and reduce by a factor of two each time.
for (int64_t y = residualThreads; y >= 1; y >>= 1) {
int64_t chosenTileSize = 0;
// Reduce the thread tiling size by one each time. We read one row each
// time, so it's fine for the size to not be a power of two here.
for (int64_t t = bestThreadM; t >= 1; --t) {
if (dimM % (y * t) == 0) {
chosenTileSize = t;
break;
}
}
if (chosenTileSize) {
wgDimSize = y;
wgTileSize = y * chosenTileSize;
assert(residualTilingFactor > chosenTileSize);
residualTilingFactor -= chosenTileSize;
return true;
}
}
return false;
}
/// Decides the tiling parameters for matmul's K dimension.
static bool tileMatmulK(const int64_t dimK, const int64_t residualTilingFactor,
int64_t &tileSize) {
// Deduce the configuration for the K dimension. We need some power of two
// here so that we can do vector load.
for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 2; t >>= 1) {
if (dimK % t == 0) {
tileSize = t;
return true;
}
}
return false;
}
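/// Returns the workgroup memory usage in bytes for promoting both the matmul
/// LHS (mTileSize x kTileSize) and RHS (kTileSize x nTileSize) tiles,
/// including the extra padding added along the K dimension to reduce bank
/// conflicts.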
int64_t getTileBytes(int64_t mTileSize, int64_t nTileSize, int64_t kTileSize,
int64_t elementBits) {
const int64_t count =
(mTileSize + nTileSize) *
(kTileSize + detail::bankConflictReductionPaddingBits / elementBits);
return (elementBits / 8) * count;
}
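/// Returns the total workgroup memory usage in bytes when multi-buffering a
/// single buffer of `singleBufferBytes` at the given pipeline `depth` (a
/// depth of zero is treated as one, i.e., no multi-buffering).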
int64_t getMultiBufferMemoryUsage(int64_t singleBufferBytes, unsigned depth) {
return singleBufferBytes * (depth ? depth : 1);
}
/// Tries to adjust workgroup and tile sizes to enable vector load for both
/// matmul LHS and RHS. Returns false only when it's not beneficial to promote.
static bool adjustToVectorLoad(ArrayRef<int64_t> dimMNKSize, int64_t &mTileSize,
int64_t &nTileSize, int64_t &kTileSize,
SmallVectorImpl<int64_t> &wgSize,
const int64_t subgroupSize, int64_t vectorSize) {
const int64_t totalThreads = wgSize[0] * wgSize[1] * wgSize[2];
LLVM_DEBUG(llvm::dbgs() << "initial total thread = " << totalThreads << "\n");
if (totalThreads <= subgroupSize) return false;
const bool canVectorLoadLHS = canPerformVectorAccessUsingAllThreads(
{mTileSize, kTileSize}, totalThreads, vectorSize);
const bool canVectorLoadRHS = canPerformVectorAccessUsingAllThreads(
{kTileSize, nTileSize}, totalThreads, vectorSize);
LLVM_DEBUG(llvm::dbgs() << "LHS vector load: " << canVectorLoadLHS << "\n");
LLVM_DEBUG(llvm::dbgs() << "RHS vector load: " << canVectorLoadRHS << "\n");
// If we can perform vector load of neither, just don't use shared memory.
if (!canVectorLoadLHS && !canVectorLoadRHS) return false;
// If we can only perform vector load for one of the operands, adjust the
// tiling scheme to see if we can make both work. Increase K to load more
// data for the smaller tile; decrease M or N for the larger tile.
if (canVectorLoadLHS && !canVectorLoadRHS) {
for (const int scale : {2, 4}) {
const int64_t newKTileSize = kTileSize * scale;
if (dimMNKSize[2] % newKTileSize != 0) continue;
const int64_t newMTileSize = mTileSize / scale;
const int64_t newWgMDim = wgSize[1] / scale;
if (newMTileSize == 0 || newWgMDim == 0) continue;
const int64_t newCount = wgSize[0] * newWgMDim * wgSize[2];
if (newCount <= subgroupSize) continue;
if (!canPerformVectorAccessUsingAllThreads({newMTileSize, newKTileSize},
newCount, vectorSize) ||
!canPerformVectorAccessUsingAllThreads({newKTileSize, nTileSize},
newCount, vectorSize)) {
continue;
}
LLVM_DEBUG({
llvm::dbgs() << "initial [M, N, K] tile size = [" << mTileSize << ", "
<< nTileSize << ", " << kTileSize << "]\n";
llvm::dbgs() << "revised [M, N, K] tile size = [" << newMTileSize
<< ", " << nTileSize << ", " << newKTileSize << "]\n";
});
mTileSize = newMTileSize;
kTileSize = newKTileSize;
wgSize[1] = newWgMDim;
break;
}
}
// TODO: improve (!canVectorLoadLHS && canVectorLoadRHS)
return true;
}
/// Tries to adjust workgroup and tile sizes to promote matmul LHS and RHS and
/// returns true if it's beneficial to promote.
static bool adjustToPromote(ArrayRef<int64_t> dimMNKSize, int64_t &mTileSize,
int64_t &nTileSize, int64_t &kTileSize,
SmallVectorImpl<int64_t> &wgSize,
unsigned &pipelineDepth, const int subgroupSize,
const int maxBytes, const int elementBits) {
LLVM_DEBUG(llvm::dbgs() << "subgroup size = " << subgroupSize << "\n");
const int vectorSize = kMaxVectorNumBits / elementBits;
if (!adjustToVectorLoad(dimMNKSize, mTileSize, nTileSize, kTileSize, wgSize,
subgroupSize, vectorSize))
return false;
// Don't do multibuffering if the inner reduction loop is folded out.
if (dimMNKSize[2] == kTileSize) pipelineDepth = 1;
auto usedBytes = getTileBytes(mTileSize, nTileSize, kTileSize, elementBits);
LLVM_DEBUG(llvm::dbgs() << "initial multibuffering bytes = "
<< getMultiBufferMemoryUsage(usedBytes, pipelineDepth)
<< "\n");
// First try to fit the given tile sizes with the largest pipelining depth
// possible.
do {
if (getMultiBufferMemoryUsage(usedBytes, pipelineDepth) <= maxBytes)
return true;
} while (pipelineDepth-- > 1);
// If we can't fit in workgroup memory, don't multibuffer.
pipelineDepth = 1;
// Using too much workgroup memory. Try to reduce the tile size for X/Y once
// by a factor of two.
int64_t &wgDimSize = wgSize[0] > wgSize[1] ? wgSize[0] : wgSize[1];
int64_t &tileSize = wgSize[0] > wgSize[1] ? nTileSize : mTileSize;
assert(wgDimSize % 2 == 0);
wgDimSize /= 2;
tileSize /= 2;
int64_t totalThreads = wgSize[0] * wgSize[1] * wgSize[2];
LLVM_DEBUG(llvm::dbgs() << "revised total thread = " << totalThreads << "\n");
usedBytes = getTileBytes(mTileSize, nTileSize, kTileSize, elementBits);
LLVM_DEBUG(llvm::dbgs() << "revised tile bytes = " << usedBytes << "\n");
return totalThreads > subgroupSize && usedBytes <= maxBytes;
}
namespace detail {
LogicalResult setMatmulOpConfig(spirv::ResourceLimitsAttr limits,
linalg::LinalgOp op,
std::array<int64_t, 2> bestWorkgroupSizeXY,
std::array<int64_t, 3> bestThreadTileSizeMNK,
bool enablePromotion,
unsigned softwarePipelineDepth) {
LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as matmul...\n");
OpOperand *lhs = op.getDpsInputOperand(0);
OpOperand *rhs = op.getDpsInputOperand(1);
auto lhsType = lhs->get().getType().cast<ShapedType>();
auto rhsType = rhs->get().getType().cast<ShapedType>();
auto elementBits = lhsType.getElementType().getIntOrFloatBitWidth();
if (elementBits != 16 && elementBits != 32) return success();
ArrayRef<int64_t> lhsShape = lhsType.getShape();
ArrayRef<int64_t> rhsShape = rhsType.getShape();
if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success();
if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success();
assert(llvm::is_contained({2u, 3u}, op.getNumParallelLoops()));
int lastParallelDim = -1;
const auto [bIndex, mIndex, nIndex, kIndex] =
getMatmulBMNKIndex(op, &lastParallelDim);
if (mIndex < 0 || nIndex < 0 || kIndex < 0) return success();
const bool isBM = bIndex >= 0;
SmallVector<int64_t, 4> loopRanges = op.getStaticLoopRanges();
const unsigned numLoops = loopRanges.size();
const int64_t dimM = loopRanges[mIndex];
const int64_t dimK = loopRanges[kIndex];
const int64_t dimN = loopRanges[nIndex];
// The core idea is to distribute the matmul M/N dimension to the workgroup
// Y/X dimension, with each thread in a workgroup handling multiple vector
// elements. We start from the best (X, Y) and the tiling sizes for (M, N, K)
// and try different configurations by scaling them down until we find a
// configuration that can perfectly tile the input matmul.
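// As an illustration (assuming bestWorkgroupSizeXY = {32, 2} and
// bestThreadTileSizeMNK = {8, 8, 4}, the values used by the dispatcher
// below): a static 1024x1024x512 matmul gets workgroup size (32, 2, 1),
// workgroup tile sizes of 16 and 256 along M and N, thread tile sizes of 8
// along both M and N, and a reduction tile size of 4 along K, before any
// promotion adjustment.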
const int64_t bestThreadM = bestThreadTileSizeMNK[0],
bestThreadN = bestThreadTileSizeMNK[1],
bestThreadK = bestThreadTileSizeMNK[2];
int64_t bestX = bestWorkgroupSizeXY[0], bestY = bestWorkgroupSizeXY[1];
// We will deduce a configuration first for x and then y. But look at the M
// dimension (mapped to y) here to see if the problem size is too small; for
// such cases, "shift" the parallelism to x.
if (dimM < bestThreadM) {
int64_t factor = llvm::PowerOf2Ceil(llvm::divideCeil(bestThreadM, dimM));
bestX *= factor;
bestY = llvm::divideCeil(bestY, factor);
}
LLVM_DEBUG({
llvm::dbgs() << "best thread tile size (M, N, K) = (" << bestThreadM << ", "
<< bestThreadN << ", " << bestThreadK << ")\n";
llvm::dbgs() << "best workgroup size (X, Y) = (" << bestX << ", " << bestY
<< ")\n";
});
int64_t residualThreads = bestX * bestY;
int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN;
SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z)
SmallVector<int64_t> workgroupTileSizes(numLoops, 0);
SmallVector<int64_t> reductionTileSizes(numLoops, 0);
if (isBM) workgroupTileSizes[bIndex] = 1;
if (!tileMatmulNToWorkgroupX(dimN, bestThreadN, residualThreads, bestX,
residualTilingFactor, workgroupSize[0],
workgroupTileSizes[nIndex]) ||
!tileMatmulMToWorkgroupY(dimM, bestThreadM, residualThreads, bestY,
residualTilingFactor, workgroupSize[1],
workgroupTileSizes[mIndex]) ||
!tileMatmulK(dimK, residualTilingFactor, reductionTileSizes[kIndex])) {
return success();
}
LLVM_DEBUG({
llvm::dbgs() << "workgroup tile size before promotion = (";
llvm::interleaveComma(workgroupTileSizes, llvm::dbgs());
llvm::dbgs() << ")\n";
llvm::dbgs() << "reduction tile size before promotion = (";
llvm::interleaveComma(reductionTileSizes, llvm::dbgs());
llvm::dbgs() << ")\n";
llvm::dbgs() << "workgroup size before promotion = (";
llvm::interleaveComma(workgroupSize, llvm::dbgs());
llvm::dbgs() << ")\n";
});
const int subgroupSize = limits.getSubgroupSize();
const int maxBytes = limits.getMaxComputeSharedMemorySize();
// If the given depth is 0, i.e., the compilation config does not specify a
// pipeline depth, default to a depth of 1, which gives a 2-stage pipeline
// without multi-buffering.
auto pipelineDepth = softwarePipelineDepth ? softwarePipelineDepth : 1;
// Try to adjust tiling sizes to fit in shared memory.
auto usePromotionPipeline =
enablePromotion &&
adjustToPromote({dimM, dimN, dimK}, workgroupTileSizes[mIndex],
workgroupTileSizes[nIndex], reductionTileSizes[kIndex],
workgroupSize, pipelineDepth, subgroupSize, maxBytes,
elementBits);
SmallVector<int64_t> threadTileSizes(numLoops, 0);
if (isBM) {
threadTileSizes[bIndex] = workgroupTileSizes[bIndex] / workgroupSize[2];
}
threadTileSizes[mIndex] = workgroupTileSizes[mIndex] / workgroupSize[1];
threadTileSizes[nIndex] = workgroupTileSizes[nIndex] / workgroupSize[0];
TileSizesListType tileSizes;
workgroupTileSizes.resize(lastParallelDim + 1);
threadTileSizes.resize(lastParallelDim + 1);
tileSizes.push_back(workgroupTileSizes);
tileSizes.push_back(threadTileSizes);
tileSizes.push_back(reductionTileSizes);
// Only the promotion pipeline has multibuffering + pipelining.
if (usePromotionPipeline) {
return setOpConfigAndEntryPointFnTranslation(
op->getParentOfType<func::FuncOp>(), op, tileSizes,
CodeGenPipeline::SPIRVMatmulPromoteVectorize, workgroupSize,
pipelineDepth);
}
return setOpConfigAndEntryPointFnTranslation(
op->getParentOfType<func::FuncOp>(), op, tileSizes,
CodeGenPipeline::SPIRVBaseVectorize, workgroupSize);
}
} // namespace detail
//===----------------------------------------------------------------------===//
// Cooperative Matrix Default Configuration
//===----------------------------------------------------------------------===//
struct CooperativeMatrixSize {
int64_t mSize; // Native cooperative matrix size along M dimension
int64_t nSize; // Native cooperative matrix size along N dimension
int64_t kSize; // Native cooperative matrix size along K dimension
int64_t mWarpCount; // # subgroups along M dimension
int64_t nWarpCount; // # subgroups along N dimension
int64_t mTileCount; // # tiles per subgroup along M dimension
int64_t nTileCount; // # tiles per subgroup along N dimension
int64_t kTileCount; // # tiles along K dimension
};
/// Returns the cooperative matrix (M, N, K) sizes that are supported by the
/// target environment and match the given parameters.
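///
/// For example (illustrative), assuming the target reports a 16x16x16
/// cooperative matrix property matching the element types, a 512x512x512
/// matmul yields (mSize, nSize, kSize) = (16, 16, 16),
/// (mWarpCount, nWarpCount) = (2, 2), and
/// (mTileCount, nTileCount, kTileCount) = (2, 2, 2).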
static Optional<CooperativeMatrixSize> getCooperativeMatrixSize(
spirv::ResourceLimitsAttr resourceLimits, Type aType, Type bType,
Type cType, int64_t m, int64_t n, int64_t k) {
auto properties = resourceLimits.getCooperativeMatrixPropertiesNv()
.getAsRange<spirv::CooperativeMatrixPropertiesNVAttr>();
for (auto property : properties) {
if (property.getAType() != aType || property.getBType() != bType ||
property.getCType() != cType || property.getResultType() != cType ||
property.getScope().getValue() != spirv::Scope::Subgroup) {
continue; // Cannot use this cooperative matrix configuration
}
const unsigned matmulM = property.getMSize();
const unsigned matmulN = property.getNSize();
const unsigned matmulK = property.getKSize();
if (m % matmulM != 0 || n % matmulN != 0 || k % matmulK != 0) continue;
uint64_t nTotalTileCount = n / matmulN;
uint64_t mTotalTileCount = m / matmulM;
uint64_t remainingWarps = numSubgroupsPerWorkgroup;
uint64_t remainingTiles = numTilesPerSubgroupDim * numTilesPerSubgroupDim;
uint64_t warpSqrt = 1ull << (llvm::Log2_64(remainingWarps) / 2);
uint64_t tileSqrt = 1ull << (llvm::Log2_64(remainingTiles) / 2);
int64_t mWarpCount = 0, nWarpCount = 0;
int64_t mTileCount = 0, nTileCount = 0;
// See if the square root can divide mTotalTileCount. If so it means we can
// distribute to both dimensions evenly. Otherwise, try to distribute to N
// and then M.
if (mTotalTileCount > (warpSqrt * tileSqrt) &&
mTotalTileCount % (warpSqrt * tileSqrt) == 0) {
mWarpCount = warpSqrt;
mTileCount = tileSqrt;
remainingWarps /= warpSqrt;
remainingTiles /= tileSqrt;
APInt nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
APInt(64, remainingWarps));
nWarpCount = nGCD.getSExtValue();
nTotalTileCount /= nWarpCount;
remainingWarps /= nWarpCount;
nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
APInt(64, remainingTiles));
nTileCount = nGCD.getSExtValue();
} else {
APInt nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
APInt(64, remainingWarps));
nWarpCount = nGCD.getSExtValue();
nTotalTileCount /= nWarpCount;
remainingWarps /= nWarpCount;
nGCD = GreatestCommonDivisor(APInt(64, nTotalTileCount),
APInt(64, remainingTiles));
nTileCount = nGCD.getSExtValue();
remainingTiles /= nTileCount;
APInt mGCD = GreatestCommonDivisor(APInt(64, mTotalTileCount),
APInt(64, remainingWarps));
mWarpCount = mGCD.getSExtValue();
mTotalTileCount /= mWarpCount;
remainingWarps /= mWarpCount;
mGCD = GreatestCommonDivisor(APInt(64, mTotalTileCount),
APInt(64, remainingTiles));
mTileCount = mGCD.getSExtValue();
}
const uint64_t kTotalTileCount = k / matmulK;
APInt kGCD = GreatestCommonDivisor(APInt(64, kTotalTileCount),
APInt(64, numTilesPerSubgroupDim));
int64_t kTileCount = kGCD.getSExtValue();
LLVM_DEBUG({
llvm::dbgs() << "chosen cooperative matrix configuration:\n";
llvm::dbgs() << " (M, N, K) size = (" << matmulM << ", " << matmulN
<< ", " << matmulK << ")\n";
llvm::dbgs() << " (M, N) subgroup count = (" << mWarpCount << ", "
<< nWarpCount << ")\n";
llvm::dbgs() << " (M, N, K) tile count per subgroup = (" << mTileCount
<< ", " << nTileCount << ", " << kTileCount << ")\n";
});
return CooperativeMatrixSize{matmulM, matmulN, matmulK,
mWarpCount, nWarpCount, mTileCount,
nTileCount, kTileCount};
}
return llvm::None;
}
namespace detail {
LogicalResult setCooperativeMatrixConfig(const spirv::TargetEnv &targetEnv,
linalg::LinalgOp op) {
LLVM_DEBUG(llvm::dbgs() << "trying to matmul tensorcore config...\n");
// This configuration is only for cooperative matrix.
if (!targetEnv.allows(spirv::Capability::CooperativeMatrixNV) ||
!targetEnv.allows(spirv::Extension::SPV_NV_cooperative_matrix)) {
return success();
}
if (op.hasDynamicShape()) return success();
Value lhs = op.getDpsInputOperand(0)->get();
Value rhs = op.getDpsInputOperand(1)->get();
Value init = op.getDpsInitOperand(0)->get();
int lastParallelDim = -1;
const auto [bIndex, mIndex, nIndex, kIndex] =
getMatmulBMNKIndex(op, &lastParallelDim);
if (mIndex < 0 || nIndex < 0 || kIndex < 0) return success();
const bool isBM = bIndex >= 0;
SmallVector<int64_t, 4> loopRanges = op.getStaticLoopRanges();
const int64_t dimM = loopRanges[mIndex];
const int64_t dimK = loopRanges[kIndex];
const int64_t dimN = loopRanges[nIndex];
LLVM_DEBUG({
llvm::dbgs() << "input matmul shape (B, M, N, K) = ("
<< (bIndex >= 0 ? loopRanges[bIndex] : -1) << ", " << dimM
<< ", " << dimN << ", " << dimK << ")\n";
});
// TODO: Cooperative matrix support is fairly restricted. We can only have
// a curated list of fused element wise ops as defined in the extension
// SPV_NV_cooperative_matrix. Check that once we move bufferization after
// vectorization.
auto getElementType = [](Value v) {
return v.getType().cast<ShapedType>().getElementType();
};
spirv::ResourceLimitsAttr resourceLimits = targetEnv.getResourceLimits();
Optional<CooperativeMatrixSize> coopMatSize = getCooperativeMatrixSize(
resourceLimits, getElementType(lhs), getElementType(rhs),
getElementType(init), dimM, dimN, dimK);
if (!coopMatSize) return success();
auto pipeline = IREE::Codegen::DispatchLoweringPassPipeline::
SPIRVCooperativeMatrixVectorize;
std::array<int64_t, 3> workgroupSize{
coopMatSize->nWarpCount * resourceLimits.getSubgroupSize(),
coopMatSize->mWarpCount, 1};
SmallVector<int64_t> vectorSizes(kIndex + 1, 0);
if (isBM) vectorSizes[bIndex] = 1;
vectorSizes[mIndex] = coopMatSize->mSize;
vectorSizes[nIndex] = coopMatSize->nSize;
vectorSizes[kIndex] = coopMatSize->kSize;
SmallVector<int64_t> subgroupTileSizes(lastParallelDim + 1, 0);
if (isBM) subgroupTileSizes[bIndex] = 1;
subgroupTileSizes[mIndex] = coopMatSize->mTileCount * vectorSizes[mIndex];
subgroupTileSizes[nIndex] = coopMatSize->nTileCount * vectorSizes[nIndex];
SmallVector<int64_t> workgroupTileSizes(lastParallelDim + 1, 0);
if (isBM) workgroupTileSizes[bIndex] = 1;
workgroupTileSizes[mIndex] =
coopMatSize->mWarpCount * subgroupTileSizes[mIndex];
workgroupTileSizes[nIndex] =
coopMatSize->nWarpCount * subgroupTileSizes[nIndex];
// Also create one level for reduction. This is needed because
// SPIRVTileAndPromotePass requires it.
// TODO(#10499): Consolidate tiling configuration across different pipelines.
SmallVector<int64_t> reductionTileSizes;
reductionTileSizes.append(kIndex, 0);
reductionTileSizes.push_back(coopMatSize->kTileCount * coopMatSize->kSize);
TileSizesListType tileSizes;
tileSizes.reserve(3);
tileSizes.push_back(workgroupTileSizes);
tileSizes.push_back(subgroupTileSizes);
tileSizes.push_back(reductionTileSizes);
tileSizes.push_back(vectorSizes);
return setOpConfigAndEntryPointFnTranslation(
op->getParentOfType<func::FuncOp>(), op, tileSizes, pipeline,
workgroupSize);
}
} // namespace detail
//===----------------------------------------------------------------------===//
// FFT Default Configuration
//===----------------------------------------------------------------------===//
static LogicalResult setFftOpConfig(spirv::ResourceLimitsAttr limits,
IREE::LinalgExt::FftOp op) {
LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as fft...\n");
const int subgroupSize = limits.getSubgroupSize();
auto pipeline = CodeGenPipeline::SPIRVBaseDistribute;
std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
SmallVector<utils::IteratorType> loopIteratorTypes =
op.getLoopIteratorTypes();
unsigned loopDepth = loopIteratorTypes.size();
SmallVector<int64_t> workgroupTileSize(loopDepth, 0);
// Tile along partitioned loops with size 1.
for (auto iteratorType : llvm::enumerate(loopIteratorTypes)) {
if (iteratorType.value() == utils::IteratorType::parallel) {
workgroupTileSize[iteratorType.index()] = 1;
}
}
auto rank = op.getOperandRank();
if (workgroupTileSize.size() >= rank && workgroupTileSize[rank - 1] != 0) {
APInt value;
if (matchPattern(op.getStage(), m_ConstantInt(&value))) {
workgroupTileSize[rank - 1] = 1ll << value.getSExtValue();
} else {
op.emitError("non-constant stage might not work for fft op");
return failure();
}
}
TileSizesListType tileSizes = {workgroupTileSize};
return setOpConfigAndEntryPointFnTranslation(
op->getParentOfType<func::FuncOp>(), op, tileSizes, pipeline,
workgroupSize);
}
//===----------------------------------------------------------------------===//
// Reduction Default Configuration
//===----------------------------------------------------------------------===//
/// Set the configuration for reductions that can be mapped to warp reductions.
static LogicalResult setReductionConfig(const spirv::TargetEnv &targetEnv,
linalg::GenericOp op) {
LLVM_DEBUG(llvm::dbgs() << "trying to deduce config as reduction...\n");
if (op.hasDynamicShape()) return failure();
// This pipeline eventually generates non-uniform group shuffle ops, which
// require a special capability.
if (!targetEnv.allows(spirv::Capability::GroupNonUniformShuffle))
return failure();
SmallVector<unsigned> reductionDims;
op.getReductionDims(reductionDims);
if (reductionDims.size() != 1 || reductionDims[0] != op.getNumLoops() - 1)
return failure();
if (op.getRegionOutputArgs().size() != 1) return failure();
// Only support projected permutations for now. This could be extended to
// projected permutations with broadcast.
if (llvm::any_of(op.getDpsInputOperands(), [&](OpOperand *input) {
return !op.getMatchingIndexingMap(input).isProjectedPermutation();
})) {
return failure();
}
// Only support single combiner operations for now.
SmallVector<Operation *, 4> combinerOps;
if (!matchReduction(op.getRegionOutputArgs(), 0, combinerOps) ||
combinerOps.size() != 1) {
return failure();
}
const int subgroupSize = targetEnv.getResourceLimits().getSubgroupSize();
Optional<int64_t> dimSize = op.getStaticLoopRanges()[reductionDims[0]];
if (!dimSize || *dimSize % subgroupSize != 0) return failure();
const Type elementType =
op.getOutputs()[0].getType().cast<ShapedType>().getElementType();
if (!elementType.isIntOrFloat()) return failure();
unsigned bitWidth = elementType.getIntOrFloatBitWidth();
// Reduction distribution only supports 32-bit types now.
if (bitWidth != 32) return failure();
// Let each thread handle `vectorSize` elements.
unsigned vectorSize = kMaxVectorNumBits / bitWidth;
while ((*dimSize / vectorSize) % subgroupSize != 0) vectorSize /= 2;
// TODO: Add reduction tiling to handle larger reductions.
const int64_t maxWorkgroupSize =
targetEnv.getResourceLimits().getMaxComputeWorkgroupInvocations();
int64_t groupSize = *dimSize / vectorSize;
if (groupSize > maxWorkgroupSize) {
groupSize = llvm::APIntOps::GreatestCommonDivisor(
{64, uint64_t(groupSize)}, {64, uint64_t(maxWorkgroupSize)})
.getZExtValue();
}
// The current warp reduction pattern is a two-step butterfly warp reduce.
// First, do warp reductions within each subgroup. Second, reduce the results
// from the multiple subgroups using a single warp reduce. The final warp
// reduce requires numSubgroupsUsed <= subgroupSize to work.
// TODO(raikonenfnu): Add flexible number of warp reduces to handle more
// configs. TT::CPU and TT::ARM_Valhall are not going through warp reduce.
const int64_t numSubgroupsUsed = groupSize / subgroupSize;
if (numSubgroupsUsed > subgroupSize) {
return failure();
}
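// As an illustration (assuming subgroupSize = 32 and a maximum of 1024
// workgroup invocations): a generic op reducing a 2048-element f32 innermost
// dimension uses vectorSize = 4, groupSize = 512 threads (16 subgroups), and
// a reduction tile size of 512 * 4 = 2048.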
std::array<int64_t, 3> workgroupSize = {groupSize, 1, 1};
// Tile all the parallel dimensions to 1.
SmallVector<unsigned> partitionedLoops =
cast<PartitionableLoopsInterface>(op.getOperation())
.getPartitionableLoops(kNumMaxParallelDims);
llvm::SmallDenseSet<unsigned, 4> partitionedLoopsSet;
partitionedLoopsSet.insert(partitionedLoops.begin(), partitionedLoops.end());
size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1;
SmallVector<int64_t, 4> workgroupTileSizes(numLoops, 1);
SmallVector<int64_t, 4> reductionTileSizes(numLoops, 0);
reductionTileSizes.push_back(groupSize * vectorSize);
TileSizesListType tileSizes;
tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
tileSizes.emplace_back(std::move(reductionTileSizes)); // reduction level
return setOpConfigAndEntryPointFnTranslation(
op->getParentOfType<func::FuncOp>(), op, tileSizes,
CodeGenPipeline::SPIRVSubgroupReduce, workgroupSize);
}
//===----------------------------------------------------------------------===//
// Everything Default Configuration
//===----------------------------------------------------------------------===//
static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits,
Operation *op,
bool allowVectorization = true) {
LLVM_DEBUG(llvm::dbgs() << "trying to deduce as default op...\n");
func::FuncOp funcOp = op->getParentOfType<func::FuncOp>();
auto interfaceOp = cast<PartitionableLoopsInterface>(*op);
auto partitionedLoops =
interfaceOp.getPartitionableLoops(kNumMaxParallelDims);
// Special case for untiled ops.
if (partitionedLoops.empty()) {
// No tiled loops means we cannot tile (and distribute) at all. Use just a
// single thread to run everything.
auto pipeline = CodeGenPipeline::SPIRVBaseDistribute;
std::array<int64_t, 3> workgroupSize = {1, 1, 1};
return setOpConfigAndEntryPointFnTranslation(funcOp, op, {}, pipeline,
workgroupSize);
}
const int subgroupSize = limits.getSubgroupSize();
const unsigned loopDepth = partitionedLoops.back() + 1;
// Configurations we need to decide.
std::array<int64_t, 3> workgroupSize;
SmallVector<int64_t> workgroupTileSizes;
SmallVector<int64_t> threadTileSizes;
// Initialize the configuration.
auto initConfiguration = [&]() {
workgroupSize = {subgroupSize, 1, 1};
workgroupTileSizes.resize(loopDepth, 0);
threadTileSizes.resize(loopDepth, 0);
// Initialize tiling along all partitioned loops with size 1.
for (int64_t loopIndex : partitionedLoops) {
workgroupTileSizes[loopIndex] = threadTileSizes[loopIndex] = 1;
}
// Override the innermost dimension to distribute to threads in a subgroup.
workgroupTileSizes.back() = subgroupSize;
threadTileSizes.back() = 1;
};
// Special case for non-linalg ops.
auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
if (!linalgOp || linalgOp.getNumDpsInits() != 1) {
auto pipeline = CodeGenPipeline::SPIRVBaseDistribute;
initConfiguration();
TileSizesListType tileSizes;
tileSizes.push_back(workgroupTileSizes);
tileSizes.push_back(threadTileSizes);
return setOpConfigAndEntryPointFnTranslation(funcOp, op, tileSizes,
pipeline, workgroupSize);
}
// Common case for all linalg ops.
// The core idea is to distribute the partitioned loops to the workgroup
// dimensions. The goal is to fill up the GPU as much as possible, which means
// 1) distributing to as many threads as possible, and 2) avoiding assigning
// too many threads to handle out-of-bound elements (which would sit idle).
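// As an illustration (assuming subgroupSize = 64): a static 128x128
// elementwise op distributes the innermost dimension with tile size 64 and
// 4 elements per thread, and the outer dimension with tile size 4 and one
// element per thread, giving workgroup size (16, 4, 1).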
// Returns true if the given `operand` has 32-bit element type.
auto has32BitElementType = [](Value operand) {
auto shapedType = operand.getType().dyn_cast<ShapedType>();
Type elementType =
(shapedType ? shapedType.getElementType() : operand.getType());
return elementType.isa<FloatType>() || elementType.isInteger(32);
};
// Whether we can try to use the vectorization pipeline.
SmallVector<int64_t, 4> loopBounds = linalgOp.getStaticLoopRanges();
bool vectorizable =
allowVectorization &&
// The vectorization pipeline assumes tensor semantics for tiling.
linalgOp.hasTensorSemantics() && !linalgOp.hasIndexSemantics() &&
// Require all affine maps to be projected permutation so that we can
// generate vector transfer ops.
llvm::all_of(
linalgOp.getIndexingMapsArray(),
[](AffineMap map) { return map.isProjectedPermutation(); }) &&
// TODO: Fix non-32-bit element type vectorization and remove this.
llvm::all_of(linalgOp->getOperands(), has32BitElementType) &&
llvm::none_of(loopBounds, ShapedType::isDynamic);
// Distribute workload to the given `numThreads` by allowing a potential loss.
auto distributeToThreads = [&](int64_t numThreads,
Optional<int64_t> lossFactor = llvm::None) {
LLVM_DEBUG(llvm::dbgs() << "\nLoss factor: " << lossFactor << "\n");
initConfiguration();
// Scan from the innermost shape dimension and try to deduce the
// configuration for the corresponding GPU workgroup dimension.
int64_t wgDim = 0;
for (auto shapeDim : llvm::reverse(partitionedLoops)) {
int64_t loopBound = loopBounds[shapeDim];
// Skip dynamic dimensions.
if (ShapedType::isDynamic(loopBound)) continue;
// Try to find some power of two that can divide the current shape dim
// size. This vector keeps the candidate tile sizes.
SmallVector<int64_t, 8> candidates;
// For the innermost workgroup dim, try to see if we can have 4
// elements per thread. This enables vectorization.
if (vectorizable && wgDim == 0 && !lossFactor) {
candidates.push_back(4 * numThreads);
}
// Try all powers of two up to `numThreads`.
for (unsigned i = numThreads; i >= 1; i >>= 1) {
candidates.push_back(i);
}
LLVM_DEBUG({
llvm::dbgs() << "Candidate tile sizes: [";
llvm::interleaveComma(candidates, llvm::dbgs());
llvm::dbgs() << "]\n";
});
for (int64_t candidate : candidates) {
if (loopBound % candidate != 0) {
if (!lossFactor) continue;
// Skip this candidate if it causes many threads to be idle.
int64_t idleThreads = candidate - (loopBound % candidate);
if (idleThreads > candidate / *lossFactor) continue;
}
// If the workload is too small and we cannot distribute to more than 2
// workgroups, try a smaller tile size to increase parallelism.
if (partitionedLoops.size() == 1 && candidate > subgroupSize &&
llvm::divideCeil(loopBound, candidate) <= 2) {
continue;
}
// Found a suitable candidate. Try to let each thread handle 4
// elements if this is the workgroup x dimension.
workgroupTileSizes[shapeDim] = candidate;
LLVM_DEBUG(llvm::dbgs() << "Chosen tile size: " << candidate << "\n");
if (vectorizable && wgDim == 0 && !lossFactor && candidate % 4 == 0) {
// Use size-1 vectors to increase parallelism if larger ones cause
// idle threads in the subgroup.
bool hasIdleThreads =
partitionedLoops.size() == 1 && candidate <= subgroupSize;
int vectorSize = hasIdleThreads ? 1 : 4;
LLVM_DEBUG(llvm::dbgs() << "Use vector size: " << vectorSize << "\n");
threadTileSizes[shapeDim] = vectorSize;
workgroupSize[wgDim] = candidate / vectorSize;
assert(numThreads % (candidate / vectorSize) == 0);
numThreads /= candidate / vectorSize;
} else {
if (wgDim == 0) vectorizable = false;
threadTileSizes[shapeDim] = 1;
workgroupSize[wgDim] = candidate;
assert(numThreads % candidate == 0);
numThreads /= candidate;
}
assert(numThreads >= 1);
break;
}
// Stop if we have distributed all threads.
if (numThreads == 1) break;
wgDim++;
}
return numThreads;
};
// First try to see if we can use up all threads without any loss.
if (distributeToThreads(subgroupSize) != 1) {
// Otherwise, allow more and more idle threads.
// Threads for distribution. Use at least 32.
int64_t numThreads = std::max(subgroupSize, 32);
// We can tolerate (1 / lossFactor) of threads in the workgroup to be idle.
int64_t lossFactor = 32;
for (; lossFactor >= 1; lossFactor >>= 1) {
if (distributeToThreads(numThreads, lossFactor) == 1) break;
}
}
auto pipeline = vectorizable ? CodeGenPipeline::SPIRVBaseVectorize
: CodeGenPipeline::SPIRVBaseDistribute;
TileSizesListType tileSizes;
tileSizes.push_back(workgroupTileSizes);
tileSizes.push_back(threadTileSizes);
if (vectorizable) {
// Try to tile all reductions by size 4 if possible. This gives us a chance
// to perform a vector4 load if an input has its innermost dimension being a
// reduction. It also avoids generating too many instructions when unrolling
// vectors later. Similarly, also try to tile other untiled parallel
// dimensions by 4 to avoid instruction bloat.
SmallVector<int64_t> loopTileSizes(linalgOp.getNumLoops(), 0);
for (const auto &it : llvm::enumerate(linalgOp.getIteratorTypesArray())) {
auto i = it.index();
if (loopBounds[i] % 4 != 0) continue;
if (linalg::isReductionIterator(it.value()) ||
workgroupTileSizes[i] == 0) {
loopTileSizes[it.index()] = 4;
}
}
if (llvm::any_of(loopTileSizes, [](int64_t s) { return s != 0; })) {
tileSizes.push_back(loopTileSizes);
}
}
return setOpConfigAndEntryPointFnTranslation(funcOp, op, tileSizes, pipeline,
workgroupSize);
}
//===----------------------------------------------------------------------===//
// Configuration Dispatcher
//===----------------------------------------------------------------------===//
/// Sets the CodeGen configuration as attributes to the given `rootOp` if it's a
/// known Linalg matmul/convolution op with good configurations.
static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv,
func::FuncOp entryPointFn,
Operation *rootOp) {
if (IREE::Codegen::CompilationInfoAttr compilationInfo =
getCompilationInfo(rootOp)) {
// If the op already has a lowering configuration specified from the
// original source by the user, then use it directly.
return setUserConfig(entryPointFn, rootOp, compilationInfo);
}
LogicalResult result = success();
// First try to find a proper CodeGen configuration to tile and vectorize for
// the current target architecture.
switch (targetEnv.getVendorID()) {
case spirv::Vendor::AMD:
result = detail::setAMDCodeGenConfig(targetEnv, rootOp);
break;
case spirv::Vendor::Apple:
result = detail::setAppleCodeGenConfig(targetEnv, rootOp);
break;
case spirv::Vendor::ARM:
result = detail::setMaliCodeGenConfig(targetEnv, rootOp);
break;
case spirv::Vendor::NVIDIA:
result = detail::setNVIDIACodeGenConfig(targetEnv, rootOp);
break;
case spirv::Vendor::Qualcomm:
result = detail::setAdrenoCodeGenConfig(targetEnv, rootOp);
break;
default:
break;
}
if (failed(result)) return result;
// Check whether there is actually a configuration found. If so, it's done.
if (getLoweringConfig(rootOp)) return result;
// Otherwise fall back to a default configuration that tiles and
// distributes/vectorizes.
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
return TypeSwitch<Operation *, LogicalResult>(rootOp)
.Case<linalg::BatchMatmulOp, linalg::MatmulOp>([limits](auto op) {
// Try to tile and vectorize first. It's common to see 32 threads
// per subgroup for GPUs.
std::array<int64_t, 2> workgroupXY = {32, 2};
std::array<int64_t, 3> threadMNK;
auto inputType =
op.getInputs()[0].getType().template cast<ShapedType>();
if (inputType.getElementType().getIntOrFloatBitWidth() == 16) {
threadMNK = {8, 8, 8};
} else {
threadMNK = {8, 8, 4};
}
auto result =
detail::setMatmulOpConfig(limits, op, workgroupXY, threadMNK);
if (failed(result)) return result;
if (getLoweringConfig(op)) return result;
// If unsuccessful, try to tile and distribute.
return setDefaultOpConfig(limits, op);
})
.Case<linalg::Conv2DNchwFchwOp, linalg::Conv2DNhwcHwcfOp,
linalg::DepthwiseConv2DNhwcHwcOp>([limits](auto op) {
// Try to tile and vectorize first. It's common to see 32 threads
// per subgroup for GPUs.
auto result = detail::setConvOpConfig(op, /*subgroupSize=*/32,
/*bestTilingFactor=*/32);
if (failed(result)) return result;
if (getLoweringConfig(op)) return result;
// If unsuccessful, try to tile and distribute.
return setDefaultOpConfig(limits, op);
})
.Case<linalg::ConvolutionOpInterface>([limits](auto op) {
// Other convolution/pooling op vectorization is not wired up.
return setDefaultOpConfig(limits, op, /*allowVectorization=*/false);
})
.Case<linalg::GenericOp>([&](linalg::GenericOp op) {
LLVM_DEBUG(llvm::dbgs() << "figuring configuration for generic op\n");
if (succeeded(setReductionConfig(targetEnv, op))) return success();
// If a generic op has reduction iterator types, it can be treated as a
// root op for configuration as well. Use the default configuration,
// which will mark it as a root.
if (op.getNumLoops() != op.getNumParallelLoops()) {
return setDefaultOpConfig(limits, op);
}
return success();
})
.Case<IREE::LinalgExt::FftOp>([limits](IREE::LinalgExt::FftOp op) {
return setFftOpConfig(limits, op);
})
.Default([](Operation *) { return success(); });
}
//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
LogicalResult initSPIRVLaunchConfig(ModuleOp module) {
llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
getAllEntryPoints(module);
spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(module);
if (!targetEnvAttr) {
return module.emitOpError(
"expected parent hal.executable.variant to have spirv.target_env "
"attribute");
}
spirv::TargetEnv targetEnv(targetEnvAttr);
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
for (auto funcOp : module.getOps<func::FuncOp>()) {
auto exportOp = exportOps.lookup(funcOp.getName());
if (!exportOp) continue;
SmallVector<Operation *> computeOps;
if (failed(getComputeOps(funcOp, computeOps))) {
return funcOp.emitOpError("failed to get compute ops");
}
if (computeOps.empty()) {
return funcOp.emitOpError(
"unhandled translation of function without compute ops");
}
Operation *rootOperation = nullptr;
// Try to find a configuration according to a matmul/convolution op and use
// it as the root op.
for (Operation *computeOp : computeOps) {
if (failed(setSPIRVOpConfig(targetEnv, funcOp, computeOp)))
return failure();
// Check if the op configuration was set.
if (!getLoweringConfig(computeOp)) continue;
if (rootOperation) {
return computeOp->emitOpError(
"unhandled multiple roots in dispatch region");
}
rootOperation = computeOp;
}
if (!rootOperation) {
// If there is still no root op, set a default configuration on the last
// compute op.
Operation *computeOp = computeOps.back();
if (failed(setDefaultOpConfig(limits, computeOp))) return failure();
// Check if the op configuration was set.
if (!getLoweringConfig(computeOp)) {
return computeOp->emitOpError(
"without known roots, the last compute operation in the tiled "
"loop body is expected to be set as root");
}
rootOperation = computeOp;
}
}
return success();
}
} // namespace iree_compiler
} // namespace mlir