| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/compiler/Codegen/SPIRV/KernelConfig.h" |
| |
| #include "iree/compiler/Codegen/SPIRV/Utils.h" |
| #include "iree/compiler/Codegen/Transforms/Transforms.h" |
| #include "iree/compiler/Codegen/Utils/MarkerUtils.h" |
| #include "iree/compiler/Codegen/Utils/Utils.h" |
| #include "iree/compiler/Dialect/HAL/IR/LoweringConfig.h" |
| #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/MathExtras.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" |
| #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" |
| #include "mlir/IR/Matchers.h" |
| |
| #define DEBUG_TYPE "iree-spirv-kernel-config" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| //===----------------------------------------------------------------------===// |
| // Utilities |
| //===----------------------------------------------------------------------===// |
| |
| /// Defines the workgroup count region on entry point ops for the |
| /// `SPIRVDistributeToGlobalID` pipeline. |
| // TODO(ravishankarm): Remove this when that pipeline is deprecated. |
| static LogicalResult setTranslationUsingDistributeToGlobalId( |
| FuncOp funcOp, ArrayRef<int64_t> workgroupSize) { |
| auto entryPointOp = getEntryPoint(funcOp); |
| MLIRContext *context = entryPointOp.getContext(); |
| auto translationInfo = buildTranslationInfo( |
| IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistributeToGlobalID, |
| /*workloadPerWorkgroup =*/{}, context); |
| setTranslationInfo(entryPointOp, translationInfo, workgroupSize); |
| OpBuilder builder(context); |
| int64_t workgroupSizeX = workgroupSize[0]; |
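  // The workgroup count region flattens the 3-D workload into a single count
  // and divides it (rounding up) by the workgroup X size, so all workgroups
  // are laid out along the X dimension only.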
| auto numWorkgroupsFn = [workgroupSizeX](OpBuilder &b, Location loc, |
| std::array<Value, 3> workload) { |
| AffineExpr e1, e2, e3; |
| bindSymbols(b.getContext(), e1, e2, e3); |
| AffineExpr expr = e1 * e2 * e3; |
| expr = expr.ceilDiv(workgroupSizeX); |
| Value numWorkgroupsX = linalg::applyMapToValues( |
| b, loc, AffineMap::get(0, 3, expr), workload)[0]; |
| Value one = b.create<arith::ConstantIndexOp>(loc, 1); |
| return std::array<Value, 3>{numWorkgroupsX, one, one}; |
| }; |
| return defineWorkgroupCountRegion(builder, funcOp, numWorkgroupsFn); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Convolution Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
/// Lets the entry point region return a fully static number of workgroups.
| // This is needed for folding `affine.min` ops to expose static-shaped tiled |
| // convolution for vectorization. |
| // TODO(#5034): Use a proper way to prove tilability and fold `affine.min`s. |
| static LogicalResult defineConvWorkgroupCountRegion( |
| Operation *op, ArrayRef<int64_t> outputShape, |
| ArrayRef<int64_t> workgroupTileSizes) { |
| auto numWorkgroupsFn = [&](OpBuilder &b, Location loc, std::array<Value, 3>) { |
| std::array<Value, 3> xyz; |
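    // `outputShape` and `workgroupTileSizes` are expected to arrive here
    // ordered as (OH, OW, OC), with the batch dimension already dropped by
    // the caller; reverse them so that OC maps to the workgroup count along
    // X, OW to Y, and OH to Z.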
| for (unsigned i = 0; i < 3; ++i) { |
| int64_t count = outputShape[i] / workgroupTileSizes[i]; |
| // This is meant for perfectly tilable cases. Double check that. |
| assert(outputShape[i] % workgroupTileSizes[i] == 0 && count != 0); |
| xyz[2 - i] = b.create<arith::ConstantIndexOp>(loc, count); |
| } |
| return xyz; |
| }; |
| OpBuilder builder(op->getContext()); |
| return defineWorkgroupCountRegion(builder, op->getParentOfType<FuncOp>(), |
| numWorkgroupsFn); |
| } |
| |
| namespace detail { |
| |
| LogicalResult setConvOpConfig(linalg::LinalgOp linalgOp, |
| const int64_t subgroupSize, |
| const int64_t bestTilingFactor) { |
| ArrayRef<int64_t> inputShape = getUntiledShape(linalgOp.inputs()[0]); |
| ArrayRef<int64_t> outputShape = getUntiledResultShape(linalgOp, 0); |
| if (llvm::any_of(inputShape, ShapedType::isDynamic)) return success(); |
| if (llvm::any_of(outputShape, ShapedType::isDynamic)) return success(); |
| |
| int64_t ic = inputShape[3]; |
| int64_t oh = outputShape[1], ow = outputShape[2], oc = outputShape[3]; |
| |
  // The conversion pipeline requires the input channel dimension to be a
  // multiple of four, or less than four.
| if (!(ic % 4 == 0 || ic < 4)) return success(); |
| |
  // The core idea is to distribute the convolution OH/OW/OC dimensions to the
  // workgroup Z/Y/X dimensions, with each thread in a workgroup handling
| // multiple vector elements. We try to 1) utilize all threads in a subgroup, |
| // and 2) handle an optimal tile size along each dimension. |
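  //
  // As an illustration (hypothetical numbers, not tied to any particular
  // target): with subgroupSize = 32 and bestTilingFactor = 32, a 1x56x56x64
  // output is handled as follows. The OC loop below picks 16 threads with 4
  // elements each (workgroup OC tile 64), leaving 2 residual threads and a
  // residual tiling factor of 8. OW then gets 2 threads with 4 elements each
  // (workgroup OW tile 8), and OH gets 1 thread with 2 elements. The result
  // is a workgroup size of (16, 2, 1) and workgroup tile sizes of
  // (N, OH, OW, OC) = (0, 2, 8, 64).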
| |
| int64_t residualThreads = subgroupSize; |
| int64_t residualTilingFactor = bestTilingFactor; |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z) |
| SmallVector<int64_t, 4> workgroupTileSizes(4, 0); // (N, OH, OW, OC) |
| SmallVector<int64_t, 4> invocationTileSizes(4, 0); // (N, OH, OW, OC) |
| |
| // Deduce the configuration for the OC dimension. |
| for (int64_t x = residualThreads; x >= 2; x >>= 1) { |
    // Handle 4 elements per thread for the innermost dimension. We need this
    // for vectorized loads.
| int64_t chosenTileSize = 4; |
| if (oc % (x * chosenTileSize) == 0) { |
| workgroupSize[0] = x; |
| workgroupTileSizes[3] = x * chosenTileSize; |
| invocationTileSizes[3] = chosenTileSize; |
| residualThreads /= x; |
| residualTilingFactor /= chosenTileSize; |
| break; |
| } |
| } |
| if (workgroupTileSizes[3] == 0) return success(); |
| |
  // Deduce the configuration for the OW and OH dimensions. Try to make them
  // equal if possible, given that we typically have images with the same
  // height and width.
| bool tileToSquare = false; |
| unsigned log2Threads = llvm::Log2_64(residualThreads); |
| if (ow == oh && residualThreads != 1 && log2Threads % 2 == 0) { |
| int64_t yz = 1ll << (log2Threads / 2); |
| |
| int64_t chosenTileSize = 1ll << (llvm::Log2_64(residualTilingFactor) / 2); |
| while (chosenTileSize >= 1 && ow % (yz * chosenTileSize) != 0) { |
| chosenTileSize >>= 1; |
| } |
| |
| if (chosenTileSize != 0) { |
| workgroupSize[1] = workgroupSize[2] = yz; |
| workgroupTileSizes[2] = workgroupTileSizes[1] = yz * chosenTileSize; |
| invocationTileSizes[2] = invocationTileSizes[1] = chosenTileSize; |
| tileToSquare = true; |
| } |
| } |
| |
  // Otherwise treat OW and OH separately to allow them to have different
  // numbers of threads and tile sizes.
| if (!tileToSquare) { |
| // Decide the tiling and distribution parameters for one dimension. |
| auto decideOneDim = [&](int64_t inputDim, int64_t &wgDimSize, |
| int64_t &wgTileSize, int64_t &invoTileSize) { |
| for (int64_t dim = residualThreads; dim >= 1; dim >>= 1) { |
| int64_t chosenTileSize = 0; |
| for (int64_t t = residualTilingFactor; t >= 1; t >>= 1) { |
| if (inputDim % (dim * t) == 0) { |
| chosenTileSize = t; |
| break; |
| } |
| } |
| if (chosenTileSize) { |
| wgDimSize = dim; |
| wgTileSize = dim * chosenTileSize; |
| invoTileSize = chosenTileSize; |
| residualThreads /= dim; |
| residualTilingFactor /= chosenTileSize; |
| return true; |
| } |
| } |
| return false; |
| }; |
| |
| if (!decideOneDim(ow, workgroupSize[1], workgroupTileSizes[2], |
| invocationTileSizes[2]) || |
| !decideOneDim(oh, workgroupSize[2], workgroupTileSizes[1], |
| invocationTileSizes[1])) { |
| return success(); |
| } |
| } |
| |
| auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize; |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(invocationTileSizes); |
| // Tiling along reduction dimensions |
| if (isa<linalg::Conv2DNhwcHwcfOp>(linalgOp)) { |
| tileSizes.push_back({0, 0, 0, 0, 1, 1, 4}); |
| } else if (isa<linalg::DepthwiseConv2DNhwOp>(linalgOp)) { |
| tileSizes.push_back({0, 0, 0, 0, 1, 1}); |
| } else { |
| return success(); |
| } |
| |
| auto funcOp = linalgOp->getParentOfType<FuncOp>(); |
| if (failed(setOpConfigAndEntryPointFnTranslation( |
| funcOp, linalgOp, tileSizes, {}, pipeline, workgroupSize))) { |
| return failure(); |
| } |
| return defineConvWorkgroupCountRegion( |
| linalgOp, llvm::makeArrayRef(outputShape).drop_front(), |
| llvm::makeArrayRef(workgroupTileSizes).drop_front()); |
| } |
| |
| } // namespace detail |
| |
| //===----------------------------------------------------------------------===// |
| // Matmul Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| namespace detail { |
| |
| LogicalResult setMatmulOpConfig(linalg::LinalgOp op, |
| std::array<int64_t, 2> bestWorkgroupSizeXY, |
| std::array<int64_t, 3> bestThreadTileSizeMNK) { |
| auto lhsType = op.inputs()[0].getType().cast<ShapedType>(); |
| auto elementBits = lhsType.getElementType().getIntOrFloatBitWidth(); |
| if (elementBits != 16 && elementBits != 32) return success(); |
| |
| ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]); |
| ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]); |
| if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success(); |
| if (llvm::any_of(rhsShape, ShapedType::isDynamic)) return success(); |
| |
| bool isBM = isa<linalg::BatchMatmulOp>(op); |
| |
| int64_t dimM = lhsShape[0 + isBM]; |
| int64_t dimK = lhsShape[1 + isBM]; |
| int64_t dimN = rhsShape[1 + isBM]; |
| |
| // The core idea is to distribute the matmul M/N dimension to the workgroup |
| // Y/X dimension, with each thread in a workgroup handling multiple vector |
| // elements. We start from the best (X, Y) and the tiling sizes for (M, N, K) |
| // and try different configurations by scaling them down until we find a |
| // configuration that can perfectly tile the input matmul. |
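  //
  // As an illustration (hypothetical sizes): with bestWorkgroupSizeXY =
  // {32, 2} and bestThreadTileSizeMNK = {8, 8, 4}, a 128x128x128 matmul is
  // handled as follows. The N loop below picks 16 threads with 8 elements
  // each (workgroup N tile 128), the M loop picks 4 threads with 8 elements
  // each (workgroup M tile 32), and the K loop picks a reduction tile size of
  // 4. The resulting workgroup size is (16, 4, 1).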
| |
| const int64_t bestX = bestWorkgroupSizeXY[0], bestY = bestWorkgroupSizeXY[1]; |
| const int64_t bestThreadM = bestThreadTileSizeMNK[0], |
| bestThreadN = bestThreadTileSizeMNK[1], |
| bestThreadK = bestThreadTileSizeMNK[2]; |
| |
| int64_t residualThreads = bestX * bestY; |
| int64_t residualTilingFactor = (bestThreadM + bestThreadK) * bestThreadN; |
| |
| SmallVector<int64_t, 3> workgroupSize(3, 1); // (X, Y, Z) |
| SmallVector<int64_t, 4> workgroupTileSizes(2 + isBM, 0); // (B, M, N) |
| SmallVector<int64_t, 4> invocationTileSizes(2 + isBM, 0); // (B, M, N) |
| SmallVector<int64_t, 4> reductionTileSizes(3 + isBM, 0); // (B, M, N, K) |
| |
| if (isBM) workgroupTileSizes[0] = invocationTileSizes[0] = 1; |
| |
| // Deduce the configuration for the N dimension. Start with the best workgroup |
| // X size, and reduce by a factor of two each time. |
| for (int64_t x = bestX; x >= 2; x >>= 1) { |
    // Handle multiple (`bestThreadN`) elements per thread for the innermost
    // dimension. We need this for vectorized loads.
| int64_t chosenTileSize = bestThreadN; |
| if (dimN % (x * chosenTileSize) == 0) { |
| workgroupSize[0] = x; |
| workgroupTileSizes[1 + isBM] = x * chosenTileSize; |
| invocationTileSizes[1 + isBM] = chosenTileSize; |
| residualThreads /= x; |
| assert(residualTilingFactor % chosenTileSize == 0); |
| residualTilingFactor /= chosenTileSize; |
| break; |
| } |
| } |
| if (workgroupTileSizes[1 + isBM] == 0) return success(); |
| |
| // Deduce the configuration for the M dimension. Start with the best workgroup |
| // Y size, and reduce by a factor of two each time. |
| for (int64_t y = residualThreads; y >= 1; y >>= 1) { |
| int64_t chosenTileSize = 0; |
    // Reduce the thread tile size by one each time. We read one row at a
    // time, so it is fine for this not to be a power of two.
| for (int64_t t = bestThreadM; t >= 1; --t) { |
| if (dimM % (y * t) == 0) { |
| chosenTileSize = t; |
| break; |
| } |
| } |
| if (chosenTileSize) { |
| workgroupSize[1] = y; |
| workgroupTileSizes[0 + isBM] = y * chosenTileSize; |
| invocationTileSizes[0 + isBM] = chosenTileSize; |
| assert(residualTilingFactor > chosenTileSize); |
| residualTilingFactor -= chosenTileSize; |
| break; |
| } |
| } |
| if (workgroupTileSizes[0 + isBM] == 0) return success(); |
| |
  // Deduce the configuration for the K dimension. We need a power of two here
  // so that we can do vector loads.
| for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 2; t >>= 1) { |
| if (dimK % t == 0) { |
| reductionTileSizes[2 + isBM] = t; |
| break; |
| } |
| } |
| if (reductionTileSizes[2 + isBM] == 0) return success(); |
| |
| auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize; |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSizes); |
| tileSizes.push_back(invocationTileSizes); |
| tileSizes.push_back(reductionTileSizes); |
| return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(), |
| op, tileSizes, {}, pipeline, |
| workgroupSize); |
| } |
| |
| } // namespace detail |
| |
| //===----------------------------------------------------------------------===// |
| // FFT Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits, |
| linalg_ext::FftOp op) { |
| const int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue(); |
| auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute; |
| |
| std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1}; |
| |
| auto partitionedLoops = getPartitionedLoops(op); |
| unsigned loopDepth = partitionedLoops.back() + 1; |
| SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 0); |
| |
| // Tiling along partitioned loops with size 1. |
| for (int64_t loopIndex : partitionedLoops) { |
| workgroupTileSize[loopIndex] = 1; |
| } |
| auto rank = op.getOperandRank(); |
| if (workgroupTileSize.size() >= rank && workgroupTileSize[rank - 1] != 0) { |
| APInt value; |
| if (matchPattern(op.getStage(), m_ConstantInt(&value))) { |
| workgroupTileSize[rank - 1] = 1ll << value.getSExtValue(); |
| } else { |
| op.emitError("non-constant stage might not work for fft op"); |
| return failure(); |
| } |
| } |
| TileSizesListType tileSizes = {workgroupTileSize}; |
| return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(), |
| op, tileSizes, {}, pipeline, |
| workgroupSize); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Default Configuration |
| //===----------------------------------------------------------------------===// |
| |
| static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits, |
| Operation *op) { |
| auto partitionedLoops = getPartitionedLoops(op); |
| if (partitionedLoops.empty()) { |
| auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize; |
| std::array<int64_t, 3> workgroupSize = {1, 1, 1}; |
| auto funcOp = op->getParentOfType<FuncOp>(); |
| return setOpConfigAndEntryPointFnTranslation(funcOp, op, {}, {}, pipeline, |
| workgroupSize); |
| } |
| |
| const int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue(); |
| int64_t numElementsPerWorkgroup = subgroupSize; |
| int64_t numElementsPerThread = 1; |
| auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute; |
| |
  // Returns true if the given `operand` has a floating-point or 32-bit
  // integer element type.
| auto has32BitElementType = [](Value operand) { |
| auto shapedType = operand.getType().dyn_cast<ShapedType>(); |
| Type elementType = |
| (shapedType ? shapedType.getElementType() : operand.getType()); |
| return elementType.isa<FloatType>() || elementType.isInteger(32); |
| }; |
| |
| if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) { |
| bool vectorize = false; |
| auto outputShape = getUntiledResultShape(linalgOp, 0); |
| |
| if (!linalgOp.hasIndexSemantics() && |
| // Skip vectorization for non-minor identity inputs as it generates |
| // vector.transfer_read ops with permutation maps that we currently |
| // cannot lower. |
| // TODO: Remove this restriction once the lowering of the permutation |
| // map is supported in core. |
| llvm::all_of(linalgOp.getIndexingMaps(), |
| [](AffineMap &map) { return map.isMinorIdentity(); }) && |
        // TODO(thomasraoux): Lowering of integers other than i32 may require
        // emulation. This is currently not supported for vector operations.
        // Re-enable this when the bug is fixed on the SPIR-V lowering side.
| llvm::all_of(linalgOp->getOperands(), has32BitElementType) && |
| llvm::all_of(outputShape, |
| [](int64_t dim) { return !ShapedType::isDynamic(dim); })) { |
| vectorize = true; |
| } |
| |
| SmallVector<int64_t, 4> candidateTileSizes; |
| if (vectorize) candidateTileSizes.push_back(4 * subgroupSize); |
| candidateTileSizes.push_back(subgroupSize); |
| |
| for (int64_t size : candidateTileSizes) { |
| if (outputShape.back() % size != 0) continue; |
| numElementsPerWorkgroup = size; |
| break; |
| } |
| |
| if (numElementsPerWorkgroup <= subgroupSize || |
| outputShape.back() % numElementsPerWorkgroup != 0) { |
| vectorize = false; |
| } |
| |
| if (vectorize) { |
| numElementsPerThread = numElementsPerWorkgroup / subgroupSize; |
| pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize; |
| } |
| } |
| |
| std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1}; |
| |
| unsigned loopDepth = partitionedLoops.back() + 1; |
| SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 0); |
| SmallVector<int64_t, 4> threadTileSize(loopDepth, 0); |
| |
| // Tiling along partitioned loops with size 1. |
| for (int64_t loopIndex : partitionedLoops) { |
| workgroupTileSize[loopIndex] = threadTileSize[loopIndex] = 1; |
| } |
| // Overwrite the configuration for the innermost dimension. |
| workgroupTileSize.back() = numElementsPerWorkgroup; |
| threadTileSize.back() = numElementsPerThread; |
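  // As an illustration (hypothetical numbers): with subgroupSize = 32 and a
  // vectorizable op whose innermost output dimension is divisible by 128, the
  // innermost workgroup tile ends up as 128 with 4 elements per invocation;
  // otherwise it is 32 with a single element per invocation.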
| |
| TileSizesListType tileSizes; |
| tileSizes.push_back(workgroupTileSize); |
| tileSizes.push_back(threadTileSize); |
| |
| return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(), |
| op, tileSizes, {}, pipeline, |
| workgroupSize); |
| } |
| |
/// Sets the CodeGen configuration as attributes on the given `rootOp` if it is
/// a known Linalg matmul/convolution op with a good configuration.
| static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv, |
| Operation *rootOp) { |
| LogicalResult result = success(); |
| // First try to find a proper CodeGen configuration to tile and vectorize for |
| // the current target architecture. |
| switch (targetEnv.getVendorID()) { |
| case spirv::Vendor::ARM: |
| result = detail::setMaliCodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::NVIDIA: |
| result = detail::setNVIDIACodeGenConfig(targetEnv, rootOp); |
| break; |
| case spirv::Vendor::Qualcomm: |
| result = detail::setAdrenoCodeGenConfig(targetEnv, rootOp); |
| break; |
| default: |
| break; |
| } |
| |
| if (failed(result)) return result; |
  // Check whether a configuration was actually found. If so, we are done.
| if (getLoweringConfig(rootOp)) return result; |
| |
  // Otherwise fall back to a default configuration that tiles and
  // distributes/vectorizes.
| spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits(); |
| return TypeSwitch<Operation *, LogicalResult>(rootOp) |
| .Case<linalg::BatchMatmulOp, linalg::MatmulOp>([limits](auto op) { |
| // Try to tile and vectorize first. |
| std::array<int64_t, 2> workgroupXY = {32, 2}; |
| std::array<int64_t, 3> threadMNK = {8, 8, 4}; |
| auto result = detail::setMatmulOpConfig(op, workgroupXY, threadMNK); |
| if (failed(result)) return result; |
| if (getLoweringConfig(op)) return result; |
| |
| // If unsuccessful, try to tile and distribute. |
| return setDefaultOpConfig(limits, op); |
| }) |
| .Case<linalg_ext::FftOp>( |
| [limits](auto op) { return setOpConfig(limits, op); }) |
| .Case<linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwOp>( |
| [limits](auto op) { |
| // Try to tile and vectorize first. It's common to see 32 threads |
| // per subgroup for GPUs. |
| auto result = detail::setConvOpConfig(op, /*subgroupSize=*/32, |
| /*bestTilingFactor=*/32); |
| if (failed(result)) return result; |
| if (getLoweringConfig(op)) return result; |
| |
| // If unsuccessful, try to tile and distribute. |
| return setDefaultOpConfig(limits, op); |
| }) |
| .Case<linalg::GenericOp>([limits](auto op) { |
        // If the generic op has reduction iterator types, it is a root as
        // well. Just set the default configuration, which marks it as a root.
| if (op.getNumLoops() != op.getNumParallelLoops()) { |
| return setDefaultOpConfig(limits, op); |
| } |
| return success(); |
| }) |
| .Default([](Operation *) { return success(); }); |
| }; |
| |
| //===----------------------------------------------------------------------===// |
| // Entry Point |
| //===----------------------------------------------------------------------===// |
| |
| LogicalResult initSPIRVLaunchConfig(ModuleOp module) { |
| llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPointOps = |
| getAllEntryPoints(module); |
| spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(module); |
| if (!targetEnvAttr) { |
| return module.emitOpError( |
| "expected parent hal.executable.variant to have spv.target_env " |
| "attribute"); |
| } |
| spirv::TargetEnv targetEnv(targetEnvAttr); |
| spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits(); |
| |
| for (auto funcOp : module.getOps<FuncOp>()) { |
| auto entryPointOp = entryPointOps.lookup(funcOp.getName()); |
| if (!entryPointOp) continue; |
| if (getTranslationInfo(entryPointOp)) continue; |
| |
| SmallVector<Operation *> computeOps; |
| SmallVector<TiledLoopInfo> tiledLoops; |
| if (failed(getComputeOps(funcOp, computeOps, tiledLoops))) { |
| return funcOp.emitOpError("failed to get compute ops"); |
| } |
| |
| int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue(); |
| |
| // If the dispatch region does not contain tiled and distributed Linalg ops, |
| // invoke the pipeline to distribute to global invocations. |
| if (tiledLoops.empty() && llvm::none_of(computeOps, [](Operation *op) { |
| return hasMarker(op, getWorkgroupMarker()); |
| })) { |
| std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1}; |
| if (failed( |
| setTranslationUsingDistributeToGlobalId(funcOp, workgroupSize))) { |
| return computeOps[0]->emitOpError( |
| "failed to set translation info for distributing to global IDs"); |
| } |
| continue; |
| } |
| |
| Operation *rootOperation = nullptr; |
| |
| // Try to find a configuration according to a matmul/convolution op and use |
| // it as the root op. |
| for (Operation *computeOp : computeOps) { |
| if (failed(setSPIRVOpConfig(targetEnv, computeOp))) return failure(); |
| |
| // Check if the op configuration was set. |
| if (!getLoweringConfig(computeOp)) continue; |
| |
| if (rootOperation) { |
| return computeOp->emitOpError( |
| "unhandled multiple roots in dispatch region"); |
| } |
| rootOperation = computeOp; |
| } |
| |
    // If there is still no root op, check for any linalg.generic op.
| if (!rootOperation) { |
| for (Operation *computeOp : reverse(computeOps)) { |
| if (failed(setDefaultOpConfig(limits, computeOp))) return failure(); |
| |
| // Check if the op configuration was set. |
| if (!getLoweringConfig(computeOp)) { |
| return computeOp->emitOpError( |
| "without known roots, the last compute operation in the tiled " |
| "loop body is expected to be set as root"); |
| } |
| rootOperation = computeOp; |
| break; |
| } |
| } |
| |
| if (!rootOperation) { |
      // If the tiled loops are not empty, then this could be a corner case of
      // tensor.insert_slice being tiled and distributed, which just shows up
      // as a `flow.dispatch.tensor.load` and a `flow.dispatch.tensor.store`
      // (or as a copy). For now just treat the tiled loops not being empty as
      // an indicator of that. We need a better way of getting this information
      // from the flow dialect to hal.
| if (!tiledLoops.empty()) { |
| const int64_t subgroupSize = |
| limits.subgroup_size().getValue().getSExtValue(); |
| std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1}; |
| SmallVector<int64_t> workloadPerWorkgroup(tiledLoops.size(), 1); |
| workloadPerWorkgroup.front() = subgroupSize * 4; |
| setTranslationInfo( |
| funcOp, IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute, |
| workgroupSize, workloadPerWorkgroup); |
        continue;
| } |
| return funcOp.emitError("contains no root Linalg operation"); |
| } |
| |
    // Propagate the `lowering.config` attribute to the other ops.
    // TODO(ravishankarm, antiagainst): This is a very specific (and fragile)
    // use. In general, this should not be needed: things are already tiled
    // and distributed, and the rest of the compilation must either be
    // structured to use `TileAndFuse` or derive independent configurations
    // based on each op.
| IREE::HAL::LoweringConfig config = getLoweringConfig(rootOperation); |
| for (auto op : computeOps) { |
| if (op == rootOperation) continue; |
| setLoweringConfig(op, config); |
| } |
| } |
| return success(); |
| } |
| |
| } // namespace iree_compiler |
| } // namespace mlir |