// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===- KernelDispatchUtils.cpp - Utilities for generating dispatch info ---===//
//
// This file defines utility functions for computing the information needed to
// partition work across workgroups: the tile sizes to use, the workgroup
// sizes, and the information the host-side dispatch needs to execute an entry
// point function (e.g. the total number of workgroups).
//
//===----------------------------------------------------------------------===//
#include "iree/compiler/Codegen/SPIRV/KernelDispatchUtils.h"
#include "iree/compiler/Codegen/Passes.h"
#include "iree/compiler/Codegen/SPIRV/LaunchConfig.h"
#include "iree/compiler/Codegen/SPIRV/Utils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Shape/IR/ShapeOps.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
#include "llvm/Support/Debug.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Vector/VectorTransforms.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#define DEBUG_TYPE "kernel-dispatch-utils"
namespace mlir {
namespace iree_compiler {
/// Given `nprocs`, tries to distribute it evenly across the 2 logical x and y
/// dimensions.
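/// E.g., `nprocs` = 512: sqrt(512) truncates to 22, PowerOf2Ceil(22) = 32, so
/// this returns (x, y) = (32, 16).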
static std::tuple<int64_t, int64_t> distributeProcs2D(int64_t nprocs) {
int64_t nprocs_x = std::max<int64_t>(
1, static_cast<int64_t>(
llvm::PowerOf2Ceil(static_cast<uint64_t>(std::sqrt(nprocs)))));
return std::make_tuple(nprocs_x, nprocs / nprocs_x);
}
/// Returns the minimum of `shape` and `tileSize` if `shape` is static; if
/// `shape` is dynamic, returns `tileSize`.
static int64_t getMinIfShapeStatic(int64_t shape, int64_t tileSize) {
if (shape == ShapedType::kDynamicSize) return tileSize;
return std::min(shape, tileSize);
}
/// Sets the `tileSizes` and `workgroupSize` for a Linalg `op` to the default,
/// where at most 3 inner parallel dimensions of `op` are tiled and distributed,
/// and each invocation handles one scalar element.
// TODO(#5852): revisit the defaults here: they were chosen to get started and
// are not very good.
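// As a sketch of the result: with a limit of 128 invocations per workgroup,
// tileSizeX = 32 and tileSizeY = 4, so an op with 3 outer parallel loops gets
// workgroup-level tile sizes {1, 4, 32}, invocation-level tile sizes
// {1, 1, 1}, and a workgroup size of {32, 4, 1}.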
static LogicalResult setDefaultTilingScheme(
const spirv::TargetEnv &targetEnv, linalg::LinalgOp op,
TileSizesListType &tileSizes, std::array<int64_t, 3> &workgroupSize) {
auto maxWorkgroupSize =
targetEnv.getResourceLimits().max_compute_workgroup_invocations();
const int64_t tileSizeX = 32;
const int64_t tileSizeY = maxWorkgroupSize.getInt() / tileSizeX;
unsigned numParallelDims = getNumOuterParallelLoops(op);
SmallVector<int64_t, 4> workgroupLevel(numParallelDims, 0);
SmallVector<int64_t, 4> invocationLevel(numParallelDims, 0);
if (numParallelDims >= 1) {
workgroupLevel.back() = tileSizeX;
invocationLevel.back() = 1;
}
if (numParallelDims >= 2) {
workgroupLevel[numParallelDims - 2] = tileSizeY;
invocationLevel[numParallelDims - 2] = 1;
}
if (numParallelDims >= 3) {
workgroupLevel[numParallelDims - 3] = 1;
invocationLevel[numParallelDims - 3] = 1;
}
tileSizes.emplace_back(std::move(workgroupLevel));
tileSizes.emplace_back(); // Subgroup level
tileSizes.emplace_back(std::move(invocationLevel));
workgroupSize = {tileSizeX, tileSizeY, 1};
return success();
}
/// Returns the original (untiled) input and output types for `op`.
static std::tuple<SmallVector<ShapedType>, SmallVector<ShapedType>>
getInputOutputTypes(linalg::LinalgOp op) {
SmallVector<ShapedType> inputTypes(op.getNumInputs()),
outputTypes(op.getNumOutputs());
auto inputOperands = op.getInputOperands();
for (auto operand : enumerate(inputOperands)) {
assert(!op.isScalar(operand.value()));
inputTypes[operand.index()] =
getUntiledType(operand.value()->get()).dyn_cast<ShapedType>();
}
auto outputOperands = op.getOutputOperands();
for (auto operand : enumerate(outputOperands)) {
outputTypes[operand.index()] =
getUntiledType(operand.value()->get()).dyn_cast<ShapedType>();
}
return std::make_tuple(std::move(inputTypes), std::move(outputTypes));
}
namespace {
struct LaunchConfigInfo {
std::array<int64_t, 3> workgroupSize = {32, 1, 1};
std::array<int64_t, 3> numSubgroups = {1, 1, 1};
bool vectorize = false;
};
struct TileWorkgroupSizePair {
// How many scalar elements each workgroup should handle along each dimension.
std::array<int64_t, 3> tileSize;
std::array<int64_t, 3> workgroupSize;
};
} // namespace
/// For a given operation `op`, compute the following configurations according
/// to SPIR-V `targetEnv` and `options`:
/// 1) number of tiling levels and tile sizes to use (updates `tileSizes`),
/// 2) workgroup size to use (updates `workgroupSize`),
/// 3) number of subgroups to use if two level tiling is used (updates
/// `numSubgroups`).
template <typename T>
static LogicalResult getOpLaunchConfig(T op, const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options,
TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
return setDefaultTilingScheme(targetEnv, op, tileSizes, config.workgroupSize);
}
static void getMaliBestMatMulTileSizes(
Type elementType, SmallVectorImpl<TileWorkgroupSizePair> &tileSizes,
int64_t dstSize) {
if (elementType.isF16()) {
const int64_t smallMatrixSizeThreshold = 512 * 512;
// For smaller destination sizes we cannot fill the GPU with bigger tile
// sizes. Instead we pick smaller tiles along M and N to increase the number
// of workgroups, and a larger K tile size since we have lower register
// pressure and need extra instructions to hide latency.
// TODO: The threshold needs to be fine tuned by doing exploration based on
// matrix shapes.
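// E.g., a 256x256 f16 destination (65536 elements) falls below the threshold
// and uses the single {16, 32, 16} tile candidate.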
if (dstSize <= smallMatrixSizeThreshold) {
tileSizes.push_back(TileWorkgroupSizePair({{16, 32, 16}, {8, 2, 1}}));
} else {
tileSizes.push_back(TileWorkgroupSizePair({{16, 64, 4}, {8, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{8, 128, 4}, {8, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{16, 32, 4}, {8, 2, 1}}));
}
} else {
// TODO: Heuristic picked based on MobileNet performance. We need
// auto-tuning to be able to make a smarter choice.
const int64_t smallMatrixSizeThreshold = 20000;
if (dstSize <= smallMatrixSizeThreshold) {
tileSizes.push_back(TileWorkgroupSizePair({{4, 32, 16}, {8, 2, 1}}));
}
tileSizes.push_back(TileWorkgroupSizePair({{12, 32, 4}, {8, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{14, 32, 4}, {8, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{10, 32, 4}, {8, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{7, 64, 4}, {16, 1, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{8, 32, 4}, {8, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{6, 32, 4}, {8, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{24, 16, 4}, {2, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{16, 16, 4}, {2, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{24, 8, 4}, {2, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{40, 8, 4}, {2, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{32, 8, 4}, {2, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{16, 8, 4}, {2, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{1, 32, 16}, {8, 1, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{1, 32, 8}, {8, 1, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{1, 32, 4}, {8, 1, 1}}));
}
}
/// Launch configuration for `linalg.batch_matmul` on Mali GPUs.
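/// E.g., an f32 batch matmul with lhs 4x1024x512 and rhs 4x512x1024 first
/// matches the {8, 32, 4} tile / {8, 2, 1} workgroup pair below, yielding
/// workgroup-level tile sizes {1, 8, 32, 4} and invocation-level tile sizes
/// {1, 4, 4, 4}.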
static LogicalResult getMaliSpecificConfig(
linalg::BatchMatmulOp op, const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options, TileSizesListType &tileSizes,
std::array<int64_t, 3> &workgroupSize,
std::array<int64_t, 3> &numSubgroups) {
if (targetEnv.getVendorID() != spirv::Vendor::ARM) return failure();
SmallVector<ShapedType> inputTypes, outputTypes;
std::tie(inputTypes, outputTypes) = getInputOutputTypes(op);
ShapedType lhsType = inputTypes[0], rhsType = inputTypes[1];
if (!lhsType || !rhsType || !lhsType.hasStaticShape() ||
!rhsType.hasStaticShape())
return failure();
// Get a vector of candidate tile sizes, ordered from best to worst.
SmallVector<TileWorkgroupSizePair, 4> workgroupLevelTs;
int64_t dstSize =
lhsType.getDimSize(0) * lhsType.getDimSize(1) * rhsType.getDimSize(2);
getMaliBestMatMulTileSizes(lhsType.getElementType(), workgroupLevelTs,
dstSize);
for (TileWorkgroupSizePair pair : workgroupLevelTs) {
if (lhsType.getDimSize(1) % pair.tileSize[0] != 0 ||
rhsType.getDimSize(2) % pair.tileSize[1] != 0 ||
lhsType.getDimSize(2) % pair.tileSize[2] != 0) {
continue;
}
workgroupSize = pair.workgroupSize;
SmallVector<int64_t, 4> batchTs;
batchTs.append({1, pair.tileSize[0], pair.tileSize[1], pair.tileSize[2]});
tileSizes.emplace_back(batchTs);
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
SmallVector<int64_t, 4> invocationLevelTs = {
batchTs[0], batchTs[1] / workgroupSize[1],
batchTs[2] / workgroupSize[0], batchTs[3]};
tileSizes.emplace_back(invocationLevelTs);
return success();
}
return failure();
}
/// Launch config for `linalg.batch_matmul`.
template <>
LogicalResult getOpLaunchConfig(linalg::BatchMatmulOp op,
const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options,
TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
if (succeeded(getMaliSpecificConfig(op, targetEnv, options, tileSizes,
config.workgroupSize,
config.numSubgroups))) {
config.vectorize = true;
return success();
}
unsigned maxWorkgroupSize = targetEnv.getResourceLimits()
.max_compute_workgroup_invocations()
.getInt();
std::tie(config.workgroupSize[0], config.workgroupSize[1]) =
distributeProcs2D(maxWorkgroupSize);
config.workgroupSize[2] = 1;
// This is just hard-wired for now to be minimally viable, but it can be
// decided better once we have better estimates of device characteristics.
const int64_t nRowsPerWorkitem = 1;
const int64_t nColsPerWorkitem = 1;
const int64_t nBatchesPerWorkitem = 1;
int64_t tileSizeK = 0;
if (options.useWorkgroupMemory) {
// This number should be decided based on the amount of shared memory
// available (maybe). For now, just hard-wire it.
tileSizeK = 32;
}
SmallVector<int64_t, 4> workgroupLevel = {
nBatchesPerWorkitem, nRowsPerWorkitem * config.workgroupSize[1],
nColsPerWorkitem * config.workgroupSize[0], tileSizeK};
SmallVector<int64_t, 4> invocationLevel = {
nBatchesPerWorkitem, nRowsPerWorkitem, nColsPerWorkitem, 0};
tileSizes.emplace_back(std::move(workgroupLevel));
tileSizes.emplace_back(); // subgroup level
tileSizes.emplace_back(std::move(invocationLevel));
return success();
}
/// Returns the {m, n, k} size of the cooperative matrix multiply operations
/// supported by the device for the given operand and result types.
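/// E.g., NVIDIA GPUs commonly expose a 16x16x16 configuration for f16
/// operands.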
static Optional<SmallVector<int64_t, 4>> getCooperativeMatmulSubgroupSize(
spirv::ResourceLimitsAttr resourceLimits, Type lhsType, Type rhsType,
Type initType, Type resultType) {
for (auto coopMatmulProperties :
resourceLimits.cooperative_matrix_properties_nv()
.getAsRange<spirv::CooperativeMatrixPropertiesNVAttr>()) {
if (coopMatmulProperties.a_type().getValue() == lhsType &&
coopMatmulProperties.b_type().getValue() == rhsType &&
coopMatmulProperties.c_type().getValue() == initType &&
coopMatmulProperties.result_type().getValue() == resultType &&
coopMatmulProperties.scope().getValue() == spirv::Scope::Subgroup) {
return SmallVector<int64_t, 4>{
coopMatmulProperties.m_size().getValue().getSExtValue(),
coopMatmulProperties.n_size().getValue().getSExtValue(),
coopMatmulProperties.k_size().getValue().getSExtValue()};
}
}
return llvm::None;
}
/// Launch configuration for using spv.CooperativeMatrixMulAddNV
/// operations. Needs two levels of tiling.
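/// As an illustration, assuming a 16x16x16 cooperative matmul shape and a
/// subgroup size of 32, with workgroup memory enabled this produces 2x2x1
/// subgroups, workgroup-level tile sizes {4*16*2, 4*16*2, 2*16} =
/// {128, 128, 32}, subgroup-level tile sizes {64, 64}, and a workgroup size
/// of {128, 1, 1}.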
static LogicalResult getConfigForCooperativeMatmul(
linalg::MatmulOp op, const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options, TileSizesListType &tileSizes,
std::array<int64_t, 3> &workgroupSize,
std::array<int64_t, 3> &numSubgroups) {
if (!targetEnv.allows(spirv::Capability::CooperativeMatrixNV) ||
!targetEnv.allows(spirv::Extension::SPV_NV_cooperative_matrix))
return failure();
SmallVector<ShapedType> inputTypes, outputTypes;
std::tie(inputTypes, outputTypes) = getInputOutputTypes(op);
ShapedType lhsType = inputTypes[0], rhsType = inputTypes[1];
ShapedType outputType = outputTypes[0];
auto resourceLimits = targetEnv.getResourceLimits();
Optional<SmallVector<int64_t, 4>> coopMatmulSize =
getCooperativeMatmulSubgroupSize(
resourceLimits, lhsType.getElementType(), rhsType.getElementType(),
outputType.getElementType(), outputType.getElementType());
if (!coopMatmulSize) return failure();
// Check that the matmul sizes are a multiple of the tile size.
auto isMultipleOf = [](int64_t s, int64_t ts) {
return !ShapedType::isDynamic(s) && (s % ts) == 0;
};
ArrayRef<int64_t> lhsShape = lhsType.getShape();
ArrayRef<int64_t> rhsShape = rhsType.getShape();
if (!isMultipleOf(lhsShape[0], (*coopMatmulSize)[0]) ||
!isMultipleOf(rhsShape[1], (*coopMatmulSize)[1]) ||
!isMultipleOf(lhsShape[1], (*coopMatmulSize)[2]) ||
!isMultipleOf(rhsShape[0], (*coopMatmulSize)[2]))
return failure();
if (options.useWorkgroupMemory) {
numSubgroups[0] = 2;
numSubgroups[1] = 2;
} else {
numSubgroups[0] = 1;
numSubgroups[1] = 1;
}
numSubgroups[2] = 1;
// For now this is hard-wired to be {4, 4, 2}. It can be set to other values,
// but the best choice ultimately depends on register pressure.
const int64_t numVecMatmulPerSubgroupX = 4;
const int64_t numVecMatmulPerSubgroupY = 4;
const int64_t numVecMatmulPerSubgroupK = 2;
SmallVector<int64_t, 4> ts = {
numVecMatmulPerSubgroupY * (*coopMatmulSize)[0] * numSubgroups[1],
numVecMatmulPerSubgroupX * (*coopMatmulSize)[1] * numSubgroups[0],
numVecMatmulPerSubgroupK * (*coopMatmulSize)[2]};
tileSizes.emplace_back(std::move(ts));
int64_t subgroupSize =
resourceLimits.subgroup_size().getValue().getSExtValue();
workgroupSize[0] = numSubgroups[0] * numSubgroups[1] * subgroupSize;
workgroupSize[1] = 1;
workgroupSize[2] = 1;
// Subgroup tile sizes
SmallVector<int64_t, 4> subgroupTs = {
numVecMatmulPerSubgroupY * (*coopMatmulSize)[0],
numVecMatmulPerSubgroupX * (*coopMatmulSize)[1]};
tileSizes.emplace_back(std::move(subgroupTs));
return success();
}
/// Launch config for element-wise linalg.generic.
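/// For instance, with a subgroup size of 32 and a static innermost output
/// dimension of 256, the candidate tile sizes are {128, 64, 32}; 256 is
/// divisible by 128, so each workgroup tiles 128 elements and each invocation
/// handles 128 / 32 = 4 elements, enabling vectorization.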
LogicalResult getGenericOpLaunchConfig(linalg::LinalgOp linalgOp,
const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options,
TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
// Skip vectorization for non-minor identity inputs as it generates
// transfer_read ops with permutation maps that we currently cannot lower.
// TODO: Remove this restriction once the lowering of the permutation map is
// supported in core.
bool vectorize = !linalgOp.hasIndexSemantics() &&
llvm::all_of(linalgOp.getIndexingMaps(), [](AffineMap &map) {
return map.isMinorIdentity();
});
// TODO(thomasraoux): Lowering of integers other than i32 may require
// emulation. This is currently not supported for vector operations. Re-enable
// this when the bug is fixed on the SPIR-V lowering side.
if (llvm::any_of(linalgOp->getOperands(), [](Value operand) {
Type elementType = operand.getType().cast<MemRefType>().getElementType();
return !elementType.isa<FloatType>() && !elementType.isInteger(32);
}))
vectorize = false;
int64_t subgroupSize =
targetEnv.getResourceLimits().subgroup_size().getValue().getSExtValue();
config.workgroupSize[0] = subgroupSize;
config.workgroupSize[1] = 1;
config.workgroupSize[2] = 1;
SmallVector<ShapedType> inputTypes, outputTypes;
std::tie(inputTypes, outputTypes) = getInputOutputTypes(linalgOp);
ShapedType outputShape = outputTypes[0];
SmallVector<int64_t, 4> candidateTileSizes;
// When vectorization is not enabled we skip the second level of tiling and
// fall back to convertToGPU, which will map one element to one thread. To
// avoid a mismatch in the number of workgroups dispatched, we pick a tile size
// to have one element per thread.
// TODO: Remove this once we switch to the linalg-on-tensors path.
if (vectorize) {
candidateTileSizes.append({4 * subgroupSize, 2 * subgroupSize});
}
candidateTileSizes.push_back(subgroupSize);
// Use the first tile size that can divide the shape. If the shape is not
// aligned on any of the tile sizes, pick the smallest tile of one element per
// thread.
int64_t lowerTs = config.workgroupSize[0];
for (int64_t size : candidateTileSizes) {
if (outputShape.getShape().back() % size != 0) continue;
lowerTs = size;
break;
}
unsigned numLoops = getNumOuterParallelLoops(linalgOp);
SmallVector<int64_t, 4> ts(numLoops, 1);
ts.back() = lowerTs;
tileSizes.emplace_back(ts); // Workgroup level
tileSizes.emplace_back(); // Subgroup level
if (!vectorize || outputShape.getShape().back() % lowerTs != 0) {
ts.back() = 1;
tileSizes.emplace_back(ts); // Thread level
config.vectorize = false;
} else {
ts.back() = lowerTs / subgroupSize;
tileSizes.emplace_back(ts); // Thread level
// Vectorize only if we are processing more than one element per thread.
config.vectorize = vectorize && (ts.back() > 1);
}
return success();
}
#define GET_GENERIC_OP_LAUNCH_CONFIG(opType) \
template <> \
LogicalResult getOpLaunchConfig( \
opType op, const spirv::TargetEnv &targetEnv, \
const SPIRVCodegenOptions &options, TileSizesListType &tileSizes, \
LaunchConfigInfo &config) { \
return getGenericOpLaunchConfig(op, targetEnv, options, tileSizes, \
config); \
}
GET_GENERIC_OP_LAUNCH_CONFIG(linalg::GenericOp)
#undef GET_GENERIC_OP_LAUNCH_CONFIG
/// Launch configuration for `linalg.matmul` on known GPU targets.
static LogicalResult getTargetSpecificConfig(
linalg::MatmulOp op, const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options, TileSizesListType &tileSizes,
std::array<int64_t, 3> &workgroupSize,
std::array<int64_t, 3> &numSubgroups) {
if (targetEnv.getVendorID() != spirv::Vendor::ARM) return failure();
SmallVector<ShapedType> inputTypes, outputTypes;
std::tie(inputTypes, outputTypes) = getInputOutputTypes(op);
ShapedType lhsType = inputTypes[0], rhsType = inputTypes[1];
// If the shapes are not static, fall back to the non-vectorized path.
if (!lhsType || !rhsType || !lhsType.hasStaticShape() ||
!rhsType.hasStaticShape())
return failure();
// Pick ideal tile size based on the type.
SmallVector<TileWorkgroupSizePair, 4> workgroupLevelTs;
int64_t dstSize = lhsType.getDimSize(0) * rhsType.getDimSize(1);
getMaliBestMatMulTileSizes(lhsType.getElementType(), workgroupLevelTs,
dstSize);
for (TileWorkgroupSizePair pair : workgroupLevelTs) {
if (lhsType.getDimSize(0) % pair.tileSize[0] != 0 ||
rhsType.getDimSize(1) % pair.tileSize[1] != 0 ||
lhsType.getDimSize(1) % pair.tileSize[2] != 0) {
continue;
}
workgroupSize = pair.workgroupSize;
SmallVector<int64_t, 4> matmulTS(pair.tileSize.begin(),
pair.tileSize.end());
tileSizes.emplace_back(matmulTS);
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
SmallVector<int64_t, 4> invocationLevelTs = {matmulTS[0] / workgroupSize[1],
matmulTS[1] / workgroupSize[0],
matmulTS[2]};
tileSizes.emplace_back(invocationLevelTs);
return success();
}
return failure();
}
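/// Launch config for `linalg.matmul`. As a fallback example: with a limit of
/// 128 invocations, distributeProcs2D yields a {16, 8, 1} workgroup, so a
/// static 64x64x64 matmul gets workgroup-level tile sizes {8, 16, 0}
/// ({8, 16, 32} with workgroup memory) and invocation-level tile sizes
/// {1, 1, 0}.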
template <>
LogicalResult getOpLaunchConfig(linalg::MatmulOp op,
const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options,
TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
if (succeeded(getConfigForCooperativeMatmul(op, targetEnv, options, tileSizes,
config.workgroupSize,
config.numSubgroups))) {
config.vectorize = true;
return success();
}
if (succeeded(getTargetSpecificConfig(op, targetEnv, options, tileSizes,
config.workgroupSize,
config.numSubgroups))) {
config.vectorize = true;
return success();
}
unsigned maxWorkgroupSize = targetEnv.getResourceLimits()
.max_compute_workgroup_invocations()
.getInt();
std::tie(config.workgroupSize[0], config.workgroupSize[1]) =
distributeProcs2D(maxWorkgroupSize);
config.workgroupSize[2] = 1;
const int nRowsPerWorkitem = 1;
const int nColsPerWorkitem = 1;
int64_t tileSizeK = 0;
if (options.useWorkgroupMemory) {
// TODO(#3131): This number should be decided based on the amount of shared
// memory available (maybe). For now, just hard-wire it.
tileSizeK = 32;
}
SmallVector<ShapedType> inputTypes;
std::tie(inputTypes, std::ignore) = getInputOutputTypes(op);
int64_t M = inputTypes[0].getShape()[0];
int64_t N = inputTypes[1].getShape()[1];
int64_t K = inputTypes[0].getShape()[1];
SmallVector<int64_t, 4> workgroupLevel = {
getMinIfShapeStatic(M, nRowsPerWorkitem * config.workgroupSize[1]),
getMinIfShapeStatic(N, nColsPerWorkitem * config.workgroupSize[0]),
getMinIfShapeStatic(K, tileSizeK)};
SmallVector<int64_t, 4> invocationLevel = {1, 1, 0};
tileSizes.emplace_back(std::move(workgroupLevel));
tileSizes.emplace_back(); // subgroup level
tileSizes.emplace_back(std::move(invocationLevel));
return success();
}
static LogicalResult getMaliSpecificConfig(linalg::ConvInputNHWCFilterHWCFOp op,
TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
SmallVector<ShapedType> inputTypes, outputTypes;
std::tie(inputTypes, outputTypes) = getInputOutputTypes(op);
ShapedType inputType = inputTypes[0], outputType = outputTypes[0];
if (!inputType || !outputType || !inputType.hasStaticShape() ||
!outputType.hasStaticShape())
return failure();
bool isInputTilable =
inputType.getDimSize(3) % 4 == 0 || inputType.getDimSize(3) < 4;
if (!isInputTilable) return failure();
// A list of preferred tile sizes and workgroup sizes. This is for Mali
// G77 now and it's fairly ad-hoc. We need to have a better story for
// incorporating such information.
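// E.g., a 1x112x112x32 output matches the first pair below: each workgroup
// computes a 4x4x16 output tile with a {4, 4, 1} workgroup, and each
// invocation computes a 4x1x4 tile.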
static const TileWorkgroupSizePair tileWorkgroupSizePairs[] = {
{{4, 4, 16}, {4, 4, 1}},
{{2, 2, 64}, {16, 1, 1}},
{{4, 8, 8}, {2, 4, 2}},
{{2, 2, 32}, {8, 2, 1}},
{{1, 1, 32}, {8, 1, 1}}};
for (const auto &pair : tileWorkgroupSizePairs) {
const std::array<int64_t, 3> &tileSize = pair.tileSize;
const std::array<int64_t, 3> &workgroupSize = pair.workgroupSize;
auto outputShape = outputType.getShape();
bool isOutputTilable = (outputShape[0] == 1) &&
(outputShape[1] % tileSize[0] == 0) &&
(outputShape[2] % tileSize[1] == 0) &&
(outputShape[3] % tileSize[2] == 0);
if (!isOutputTilable) continue;
SmallVector<int64_t, 4> workgroupLevel = {
/*batch=*/0, /*output_height=*/tileSize[0],
/*output_width=*/tileSize[1], /*output_channel=*/tileSize[2]};
tileSizes.emplace_back(std::move(workgroupLevel));
// No tiling at the subgroup level given that we don't use subgroup-level
// synchronization or shared memory.
tileSizes.emplace_back();
SmallVector<int64_t, 4> invocationLevel = {
/*batch=*/0, /*output_height=*/tileSize[0] / workgroupSize[2],
/*output_width=*/tileSize[1] / workgroupSize[1],
/*output_channel=*/tileSize[2] / workgroupSize[0]};
tileSizes.emplace_back(invocationLevel);
// Finally, for each invocation, we use tiling to generate loops over the
// filter's height (step 1), width (step 1), and input channel (step 4)
// dimensions.
SmallVector<int64_t, 4> fourthLevel = {0, 0, 0, 0, 1, 1, 4};
tileSizes.emplace_back(fourthLevel);
config.workgroupSize = workgroupSize;
config.vectorize = true;
return success();
}
return failure();
}
template <>
LogicalResult getOpLaunchConfig(linalg::ConvInputNHWCFilterHWCFOp op,
const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options,
TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
if (targetEnv.getVendorID() == spirv::Vendor::ARM &&
succeeded(getMaliSpecificConfig(op, tileSizes, config))) {
return success();
}
return setDefaultTilingScheme(targetEnv, op, tileSizes, config.workgroupSize);
}
static LogicalResult getMaliSpecificConfig(
linalg::DepthwiseConvInputNHWCFilterHWCOp op, TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
SmallVector<ShapedType> inputTypes, outputTypes;
std::tie(inputTypes, outputTypes) = getInputOutputTypes(op);
ShapedType inputType = inputTypes[0], outputType = outputTypes[0];
if (!inputType || !outputType || !inputType.hasStaticShape() ||
!outputType.hasStaticShape())
return failure();
// A list of preferred tile sizes and workgroup sizes. This is for Mali
// G77 now and it's fairly ad-hoc. We need to have a better story for
// incorporating such information.
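// E.g., a 1x56x56x64 output matches the first pair below: each workgroup
// computes a 2x2x32 output tile with a {8, 2, 2} workgroup, and each
// invocation computes a 1x1x4 tile.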
static const TileWorkgroupSizePair tileWorkgroupSizePairs[] = {
{{2, 2, 32}, {8, 2, 2}},
{{1, 4, 16}, {4, 4, 1}},
{{1, 1, 64}, {16, 1, 1}},
};
for (const auto &pair : tileWorkgroupSizePairs) {
const std::array<int64_t, 3> &tileSize = pair.tileSize;
const std::array<int64_t, 3> &workgroupSize = pair.workgroupSize;
auto outputShape = outputType.getShape();
bool isOutputTilable = outputShape[0] == 1 &&
(outputShape[1] % tileSize[0] == 0) &&
(outputShape[2] % tileSize[1] == 0) &&
(outputShape[3] % tileSize[2] == 0);
if (!isOutputTilable) continue;
SmallVector<int64_t, 4> workgroupLevel = {/*batch=*/0,
/*output_height=*/tileSize[0],
/*output_width=*/tileSize[1],
/*output_channel=*/tileSize[2]};
tileSizes.emplace_back(std::move(workgroupLevel));
// No tiling at the subgroup level given that we don't use subgroup-level
// synchronization or shared memory.
tileSizes.emplace_back();
SmallVector<int64_t, 4> invocationLevel = {
/*batch=*/0, /*output_height=*/tileSize[0] / workgroupSize[2],
/*output_width=*/tileSize[1] / workgroupSize[1],
/*output_channel=*/tileSize[2] / workgroupSize[0]};
tileSizes.emplace_back(invocationLevel);
// Finally, for each invocation, we use tiling to generate loops over the
// filter's height (step 1) and width (step 1) dimensions.
SmallVector<int64_t, 4> fourthLevel = {0, 0, 0, 0, 1, 1};
tileSizes.emplace_back(fourthLevel);
config.workgroupSize = workgroupSize;
config.vectorize = true;
return success();
}
return failure();
}
template <>
LogicalResult getOpLaunchConfig(linalg::DepthwiseConvInputNHWCFilterHWCOp op,
const spirv::TargetEnv &targetEnv,
const SPIRVCodegenOptions &options,
TileSizesListType &tileSizes,
LaunchConfigInfo &config) {
if (targetEnv.getVendorID() == spirv::Vendor::ARM &&
succeeded(getMaliSpecificConfig(op, tileSizes, config))) {
return success();
}
return setDefaultTilingScheme(targetEnv, op, tileSizes, config.workgroupSize);
}
Optional<LaunchConfig> initGPULaunchConfig(
MLIRContext *context, const linalg::LinalgDependenceGraph &dependenceGraph,
const SPIRVCodegenOptions &options, ArrayRef<linalg::LinalgOp> linalgOps) {
LaunchConfig launchConfig;
if (!options.workgroupSize.empty()) {
SmallVector<int64_t, 3> workgroupTileSizes(
options.workgroupTileSizes.begin(), options.workgroupTileSizes.end());
SmallVector<int64_t, 3> invocationTileSizes(
options.invocationTileSizes.begin(), options.invocationTileSizes.end());
for (linalg::LinalgOp linalgOp : linalgOps) {
launchConfig.setTileSizes(linalgOp.getOperation(), workgroupTileSizes, 0);
// Subgroup level.
launchConfig.setTileSizes(linalgOp.getOperation(), {}, 1);
// Invocation level.
launchConfig.setTileSizes(linalgOp.getOperation(), invocationTileSizes,
2);
launchConfig.setVectorize(true);
}
SmallVector<int64_t, 3> workgroupSize(options.workgroupSize.begin(),
options.workgroupSize.end());
launchConfig.setWorkgroupSize(workgroupSize);
}
if (linalgOps.empty()) return launchConfig;
spirv::TargetEnv targetEnv(spirv::lookupTargetEnv(*linalgOps.begin()));
Optional<linalg::LinalgOp> rootOperation = {};
LaunchConfigInfo config;
#define DISPATCH(opName) \
if (auto op = dyn_cast<opName>(linalgOp.getOperation())) { \
rootOperation = linalgOp; \
if (launchConfig.hasTileSizes(linalgOp.getOperation())) break; \
TileSizesListType tileSizesInfo; \
if (failed(getOpLaunchConfig(op, targetEnv, options, tileSizesInfo, \
config))) { \
return llvm::None; \
} \
launchConfig.setTileSizes(op, tileSizesInfo); \
break; \
}
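// Walk the Linalg ops and make the first one matching a known named op the
// root operation; its launch configuration is later propagated to the rest.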
for (linalg::LinalgOp linalgOp : linalgOps) {
DISPATCH(linalg::BatchMatmulOp)
DISPATCH(linalg::DepthwiseConvInputNHWCFilterHWCOp)
DISPATCH(linalg::DepthwiseConvInputNHWCFilterHWCFOp)
DISPATCH(linalg::ConvInputNWCFilterWCFOp)
DISPATCH(linalg::ConvInputNHWCFilterHWCFOp)
DISPATCH(linalg::ConvInputNDHWCFilterDHWCFOp)
DISPATCH(linalg::MatmulOp)
DISPATCH(linalg::PoolingNhwcMaxOp)
DISPATCH(linalg::PoolingNhwcMinOp)
DISPATCH(linalg::PoolingNhwcSumOp)
}
// If no root operation was found above, make a generic op the root.
if (!rootOperation) {
for (linalg::LinalgOp linalgOp : reverse(linalgOps)) {
size_t numLoops = getNumOuterParallelLoops(linalgOp);
if (numLoops == 0 ||
llvm::any_of(linalgOp.getIndexingMaps(), [](AffineMap &map) {
return !map.isProjectedPermutation();
})) {
return llvm::None;
}
DISPATCH(linalg::GenericOp)
}
}
#undef DISPATCH
if (!rootOperation) {
return llvm::None;
}
launchConfig.setRootOperation(*rootOperation);
if (options.workgroupSize.empty()) {
launchConfig.setWorkgroupSize(config.workgroupSize);
launchConfig.setVectorize(config.vectorize);
}
launchConfig.setNumSubgroups(config.numSubgroups);
if (failed(propogateRootOperationLaunchConfig(launchConfig, *rootOperation,
dependenceGraph)))
return llvm::None;
// TODO(ravishankarm): Verify that the chosen configuration is within the
// device limits.
return launchConfig;
}
template <typename OpTy>
static Optional<SmallVector<int64_t, 4>> getOpNativeVectorSize(OpTy op) {
return llvm::None;
}
template <>
Optional<SmallVector<int64_t, 4>> getOpNativeVectorSize<vector::ContractionOp>(
vector::ContractionOp op) {
auto targetEnvAttr = spirv::lookupTargetEnv(op);
auto targetEnv = spirv::TargetEnv(targetEnvAttr);
if (targetEnv.allows(spirv::Capability::CooperativeMatrixNV) &&
targetEnv.allows(spirv::Extension::SPV_NV_cooperative_matrix)) {
return getCooperativeMatmulSubgroupSize(
targetEnv.getResourceLimits(), op.getLhsType().getElementType(),
op.getRhsType().getElementType(),
op.getAccType().cast<VectorType>().getElementType(),
op.getResultType().cast<VectorType>().getElementType());
} else {
unsigned lastParallelDim = 0;
for (auto it : llvm::enumerate(op.iterator_types())) {
if (isParallelIterator(it.value())) lastParallelDim = it.index();
}
SmallVector<int64_t, 4> nativeSize(op.iterator_types().size(), 1);
nativeSize[lastParallelDim] = 4;
// Map to vec4 fma operations.
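// E.g., a (parallel, parallel, reduction) matmul contraction yields
// {1, 4, 1}: 4-wide vectors along the N dimension.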
return nativeSize;
}
}
template <>
Optional<SmallVector<int64_t, 4>> getOpNativeVectorSize<vector::FMAOp>(
vector::FMAOp op) {
SmallVector<int64_t, 4> size(op.getType().getRank(), 1);
size.back() = 4;
return size;
}
template <>
Optional<SmallVector<int64_t, 4>> getOpNativeVectorSize<vector::TransferReadOp>(
vector::TransferReadOp op) {
auto targetEnv = spirv::TargetEnv(spirv::lookupTargetEnv(op));
if (targetEnv.allows(spirv::Capability::CooperativeMatrixNV) &&
targetEnv.allows(spirv::Extension::SPV_NV_cooperative_matrix)) {
// Unroll cooperative matrix load based on the size of the contract.
VectorType dstVec;
for (Operation *user : op->getUsers()) {
auto extract = dyn_cast<vector::ExtractStridedSliceOp>(user);
if (!extract) return llvm::None;
auto vecType = extract.getResult().getType().cast<VectorType>();
if (dstVec && dstVec != vecType) return llvm::None;
dstVec = vecType;
}
return SmallVector<int64_t, 4>(dstVec.getShape().begin(),
dstVec.getShape().end());
}
// Map to load4.
auto rank = op.getVectorType().getRank();
SmallVector<int64_t, 4> nativeSize(rank, 1);
// Load 4 elements along the innermost dimension.
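// E.g., a 2-D transfer_read with a minor identity permutation map yields
// {1, 4}, i.e. a load4 along the innermost memref dimension.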
for (auto dim : llvm::enumerate(op.permutation_map().getResults())) {
if (auto dimExpr = dim.value().dyn_cast<AffineDimExpr>()) {
if (dimExpr.getPosition() == op.permutation_map().getNumDims() - 1)
nativeSize[dim.index()] = 4;
}
}
return nativeSize;
}
template <>
Optional<SmallVector<int64_t, 4>>
getOpNativeVectorSize<vector::TransferWriteOp>(vector::TransferWriteOp op) {
auto targetEnv = spirv::TargetEnv(spirv::lookupTargetEnv(op));
if (targetEnv.allows(spirv::Capability::CooperativeMatrixNV) &&
targetEnv.allows(spirv::Extension::SPV_NV_cooperative_matrix)) {
// Unroll cooperative matrix store based on the size of the contract.
auto insert = op.vector().getDefiningOp<vector::InsertStridedSliceOp>();
if (!insert) return llvm::None;
ArrayRef<int64_t> shape = insert.getSourceVectorType().getShape();
return SmallVector<int64_t, 4>(shape.begin(), shape.end());
}
// Map to store4.
auto rank = op.getVectorType().getRank();
SmallVector<int64_t, 4> nativeSize(rank, 1);
// Store 4 elements along the innermost dimension.
for (auto dim : llvm::enumerate(op.permutation_map().getResults())) {
if (auto dimExpr = dim.value().dyn_cast<AffineDimExpr>()) {
if (dimExpr.getPosition() == op.permutation_map().getNumDims() - 1)
nativeSize[dim.index()] = 4;
}
}
return nativeSize;
}
Optional<SmallVector<int64_t, 4>> getSPIRVNativeVectorSize(Operation *op) {
#define DISPATCH(opname) \
if (isa<opname>(op)) { \
return getOpNativeVectorSize(cast<opname>(op)); \
}
DISPATCH(vector::ContractionOp)
DISPATCH(vector::FMAOp)
DISPATCH(vector::TransferReadOp)
DISPATCH(vector::TransferWriteOp)
#undef DISPATCH
if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1) {
if (auto vecType = op->getResultTypes()[0].dyn_cast<VectorType>()) {
// Map elementwise ops to vec4.
SmallVector<int64_t, 4> nativeSize(vecType.getRank() - 1, 1);
nativeSize.push_back(4);
return nativeSize;
}
}
return llvm::None;
}
} // namespace iree_compiler
} // namespace mlir