// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include "iree/compiler/Codegen/SPIRV/Utils.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/HAL/IR/LoweringConfig.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#define DEBUG_TYPE "iree-spirv-kernel-config"
namespace mlir {
namespace iree_compiler {
//===----------------------------------------------------------------------===//
// Utilities
//===----------------------------------------------------------------------===//
/// Given `nprocs`, tries to distribute it evenly across 2 logical dimensions.
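/// For example, `nprocs` = 128 yields (16, 8): the x dimension gets the
/// power-of-two ceiling of sqrt(nprocs) and the y dimension gets the rest.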
static std::tuple<int64_t, int64_t> distributeProcs2D(int64_t nprocs) {
int64_t nprocs_x = std::max<int64_t>(
1, static_cast<int64_t>(
llvm::PowerOf2Ceil(static_cast<uint64_t>(std::sqrt(nprocs)))));
return std::make_tuple(nprocs_x, nprocs / nprocs_x);
}
/// Returns the minimum of `shape` and `tileSize` if shape is static.
/// Returns `tileSize` otherwise.
int64_t getMinIfStaticShape(int64_t shape, int64_t tileSize) {
if (shape == ShapedType::kDynamicSize) return tileSize;
return std::min(shape, tileSize);
}
/// Defines the workgroup count region on entry point ops for the
/// `SPIRVDistributeToGlobalID` pipeline.
// TODO(ravishankarm): Remove this when that pipeline is deprecated.
static LogicalResult setTranslationUsingDistributeToGlobalId(
FuncOp funcOp, ArrayRef<int64_t> workgroupSize) {
auto entryPointOp = getEntryPoint(funcOp);
MLIRContext *context = entryPointOp.getContext();
auto translationInfo = buildTranslationInfo(
IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistributeToGlobalID,
/*workloadPerWorkgroup=*/{}, context);
setTranslationInfo(entryPointOp, translationInfo, workgroupSize);
OpBuilder builder(context);
int64_t workgroupSizeX = workgroupSize[0];
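// The workgroup count region linearizes the 3-D workload and ceil-divides it
// by the workgroup size along x, producing a 1-D grid:
// (ceil(x * y * z / workgroupSizeX), 1, 1).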
auto numWorkgroupsFn = [workgroupSizeX](OpBuilder &b, Location loc,
std::array<Value, 3> workload) {
AffineExpr e1, e2, e3;
bindSymbols(b.getContext(), e1, e2, e3);
AffineExpr expr = e1 * e2 * e3;
expr = expr.ceilDiv(workgroupSizeX);
Value numWorkgroupsX = linalg::applyMapToValues(
b, loc, AffineMap::get(0, 3, expr), workload)[0];
Value one = b.create<ConstantIndexOp>(loc, 1);
return std::array<Value, 3>{numWorkgroupsX, one, one};
};
return defineWorkgroupCountRegion(builder, funcOp, numWorkgroupsFn);
}
namespace detail {
LogicalResult defineConvWorkgroupCountRegion(
Operation *op, ArrayRef<int64_t> outputShape,
ArrayRef<int64_t> workgroupTileSizes) {
auto numWorkgroupsFn = [&](OpBuilder &b, Location loc, std::array<Value, 3>) {
std::array<Value, 3> xyz;
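// The workgroup count along dimension 2 - i is
// outputShape[i] / workgroupTileSizes[i], so outputShape[0] maps to the z
// count and outputShape[2] maps to the x count.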
for (unsigned i = 0; i < 3; ++i) {
int64_t count = outputShape[i] / workgroupTileSizes[i];
xyz[2 - i] = b.create<ConstantIndexOp>(loc, count);
}
return xyz;
};
OpBuilder builder(op->getContext());
return defineWorkgroupCountRegion(builder, op->getParentOfType<FuncOp>(),
numWorkgroupsFn);
}
} // namespace detail
//===----------------------------------------------------------------------===//
// Matmul Default Configuration
//===----------------------------------------------------------------------===//
static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
linalg::BatchMatmulOp op) {
unsigned maxWorkgroupSize =
limits.max_compute_workgroup_invocations().getInt();
// This is just hard-wired for now to be minimally viable, but this can be
// decided better when we have better estimates of device characteristics.
const int64_t numRowsPerThread = 1;
const int64_t numColsPerThread = 1;
const int64_t numBatchesPerThread = 1;
const int64_t tileSizeK = 0;
std::array<int64_t, 3> workgroupSize = {1, 1, 1};
std::tie(workgroupSize[0], workgroupSize[1]) =
distributeProcs2D(maxWorkgroupSize);
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;
TileSizesListType tileSizes;
// Workgroup level.
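// Each workgroup covers one batch, workgroupSize[1] rows, and workgroupSize[0]
// columns of the output; tileSizeK == 0 leaves the reduction dimension
// untiled.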
tileSizes.push_back({numBatchesPerThread, numRowsPerThread * workgroupSize[1],
numColsPerThread * workgroupSize[0], tileSizeK});
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
// Invocation level.
tileSizes.push_back(
{numBatchesPerThread, numRowsPerThread, numColsPerThread, 0});
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
workgroupSize);
}
static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
linalg::MatmulOp op) {
unsigned maxWorkgroupSize =
limits.max_compute_workgroup_invocations().getInt();
std::array<int64_t, 3> workgroupSize = {1, 1, 1};
std::tie(workgroupSize[0], workgroupSize[1]) =
distributeProcs2D(maxWorkgroupSize);
const int numRowsPerThread = 1;
const int numColsPerThread = 1;
int64_t tileSizeK = 0;
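// The untiled operand shapes give the problem size: lhs is MxK and rhs is KxN.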
ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
int64_t M = lhsShape[0];
int64_t N = rhsShape[1];
int64_t K = lhsShape[1];
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;
TileSizesListType tileSizes;
// Workgroup level.
tileSizes.push_back(
{getMinIfStaticShape(M, numRowsPerThread * workgroupSize[1]),
getMinIfStaticShape(N, numColsPerThread * workgroupSize[0]),
getMinIfStaticShape(K, tileSizeK)});
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
// Invocation level.
tileSizes.push_back({1, 1, 0});
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
workgroupSize);
}
//===----------------------------------------------------------------------===//
// Default Configuration
//===----------------------------------------------------------------------===//
static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits,
Operation *op) {
auto partitionedLoops = getPartitionedLoops(op);
if (partitionedLoops.empty()) {
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
std::array<int64_t, 3> workgroupSize = {1, 1, 1};
auto funcOp = op->getParentOfType<FuncOp>();
return setOpConfigAndEntryPointFnTranslation(funcOp, op, {}, {}, pipeline,
workgroupSize);
}
const int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue();
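// By default distribute one subgroup's worth of scalar elements per workgroup,
// one element per invocation; the vectorization heuristic below may scale
// these up.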
int64_t numElementsPerWorkgroup = subgroupSize;
int64_t numElementsPerThread = 1;
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;
// Returns true if the given `operand` has a 32-bit element type.
auto has32BitElementType = [](Value operand) {
auto shapedType = operand.getType().dyn_cast<ShapedType>();
Type elementType =
(shapedType ? shapedType.getElementType() : operand.getType());
return elementType.isa<FloatType>() || elementType.isInteger(32);
};
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
bool vectorize = false;
auto outputShape = getUntiledResultShape(linalgOp, 0);
if (!linalgOp.hasIndexSemantics() &&
// Skip vectorization for non-minor identity inputs as it generates
// vector.transfer_read ops with permutation maps that we currently
// cannot lower.
// TODO: Remove this restriction once the lowering of the permutation
// map is supported in core.
llvm::all_of(linalgOp.getIndexingMaps(),
[](AffineMap &map) { return map.isMinorIdentity(); }) &&
// TODO(thomasraoux): Lowering of integers other than i32 may require
// emulation. This is currently not supported for vector operations.
// Re-enable this when the bug is fixed on the SPIR-V lowering side.
llvm::all_of(linalgOp->getOperands(), has32BitElementType) &&
llvm::all_of(outputShape,
[](int64_t dim) { return !ShapedType::isDynamic(dim); })) {
vectorize = true;
}
SmallVector<int64_t, 4> candidateTileSizes;
if (vectorize) candidateTileSizes.push_back(4 * subgroupSize);
candidateTileSizes.push_back(subgroupSize);
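// Pick the largest candidate that evenly divides the innermost output
// dimension: 4 elements per invocation when vectorizing, otherwise one element
// per invocation.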
for (int64_t size : candidateTileSizes) {
if (outputShape.back() % size != 0) continue;
numElementsPerWorkgroup = size;
break;
}
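// Only keep vectorization if each invocation ends up with more than one
// element and the innermost dimension is covered evenly.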
if (numElementsPerWorkgroup <= subgroupSize ||
outputShape.back() % numElementsPerWorkgroup != 0) {
vectorize = false;
}
if (vectorize) {
numElementsPerThread = numElementsPerWorkgroup / subgroupSize;
pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
}
}
std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
unsigned loopDepth = partitionedLoops.back() + 1;
SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 0);
SmallVector<int64_t, 4> threadTileSize(loopDepth, 0);
// Tile each partitioned loop with size 1.
for (int64_t loopIndex : partitionedLoops) {
workgroupTileSize[loopIndex] = threadTileSize[loopIndex] = 1;
}
// Overwrite the configuration for the innermost dimension.
workgroupTileSize.back() = numElementsPerWorkgroup;
threadTileSize.back() = numElementsPerThread;
TileSizesListType tileSizes;
tileSizes.push_back(workgroupTileSize);
tileSizes.emplace_back(); // Subgroup level.
tileSizes.push_back(threadTileSize);
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
workgroupSize);
}
/// Sets the CodeGen configuration as attributes on the given `rootOp` if it's
/// a known Linalg matmul/convolution op for which we have good configurations.
static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv,
Operation *rootOp) {
LogicalResult result = success();
// First try to find a proper CodeGen configuration for the current
// target architecture.
switch (targetEnv.getVendorID()) {
case spirv::Vendor::ARM:
result = detail::setMaliCodeGenConfig(targetEnv, rootOp);
break;
case spirv::Vendor::NVIDIA:
result = detail::setNVIDIACodeGenConfig(targetEnv, rootOp);
break;
case spirv::Vendor::Qualcomm:
result = detail::setAdrenoCodeGenConfig(targetEnv, rootOp);
break;
default:
break;
}
if (failed(result)) return result;
// Check whether there is actually a configuration found. If so, it's done.
if (getLoweringConfig(rootOp)) return result;
// Otherwise, fall back to a default configuration.
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
return TypeSwitch<Operation *, LogicalResult>(rootOp)
.Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
[limits](auto op) { return setOpConfig(limits, op); })
.Case<linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwOp>(
[limits](auto op) { return setDefaultOpConfig(limits, op); })
.Default([](Operation *) { return success(); });
}
//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
LogicalResult initSPIRVLaunchConfig(ModuleOp module) {
llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPointOps =
getAllEntryPoints(module);
spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(module);
if (!targetEnvAttr) {
return module.emitOpError(
"expected parent hal.executable.variant to have spv.target_env "
"attribute");
}
spirv::TargetEnv targetEnv(targetEnvAttr);
spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
for (auto funcOp : module.getOps<FuncOp>()) {
auto entryPointOp = entryPointOps.lookup(funcOp.getName());
if (!entryPointOp) continue;
if (getTranslationInfo(entryPointOp)) continue;
SmallVector<Operation *, 4> computeOps;
SmallVector<Operation *, 4> tiledLoops;
if (failed(getComputeOps(funcOp, computeOps, tiledLoops))) {
return funcOp.emitOpError("failed to get compute ops");
}
int64_t subgroupSize =
targetEnv.getResourceLimits().subgroup_size().getValue().getSExtValue();
// If the dispatch region does not contain tiled and distributed Linalg ops,
// invoke the pipeline to distribute to global invocations.
if (tiledLoops.empty() && llvm::none_of(computeOps, [](Operation *op) {
return hasMarker(op, getWorkgroupMarker());
})) {
std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
if (failed(
setTranslationUsingDistributeToGlobalId(funcOp, workgroupSize))) {
return computeOps[0]->emitOpError(
"failed to set translation info for distributing to global IDs");
}
continue;
}
Operation *rootOperation = nullptr;
// Try to find a matmul/convolution op with a known configuration and use it
// as the root op.
for (Operation *computeOp : computeOps) {
if (failed(setSPIRVOpConfig(targetEnv, computeOp))) return failure();
// Check if the op configuration was set.
if (!getLoweringConfig(computeOp)) continue;
if (rootOperation) {
return computeOp->emitOpError(
"unhandled multiple roots in dispatch region");
}
rootOperation = computeOp;
}
// If there is still no root op, check for any linalg.generic op.
if (!rootOperation) {
for (Operation *computeOp : reverse(computeOps)) {
if (failed(setDefaultOpConfig(limits, computeOp))) return failure();
// Check if the op configuration was set.
if (!getLoweringConfig(computeOp)) {
return computeOp->emitOpError(
"without known roots, the last operation in the tiled loop body "
"is expected to be set as root");
}
rootOperation = computeOp;
break;
}
}
if (!rootOperation) {
// If the tiled loops are not empty, this could be a corner case of a
// tensor.insert_slice being tiled and distributed, which just shows up as a
// `flow.dispatch.tensor.load` and a `flow.dispatch.tensor.store` (or as a
// copy). For now just treat non-empty tiled loops as an indicator of that.
// We need a better way of passing this information from the flow dialect to
// hal.
if (!tiledLoops.empty()) {
const int64_t subgroupSize =
limits.subgroup_size().getValue().getSExtValue();
std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
SmallVector<int64_t> workloadPerWorkgroup(tiledLoops.size(), 1);
workloadPerWorkgroup.front() = subgroupSize * 4;
setTranslationInfo(
funcOp, IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute,
workgroupSize, workloadPerWorkgroup);
return success();
}
return funcOp.emitError("contains no root Linalg operation");
}
// Propagate the `lowering.config` attribute to the other ops.
// TODO(ravishankarm, antiagainst): This is a very specific use (and
// fragile). In general it should not be needed: things are already tiled
// and distributed, and the rest of the compilation should either use
// `TileAndFuse` or determine independent configurations based on each op.
IREE::HAL::LoweringConfig config = getLoweringConfig(rootOperation);
for (auto op : computeOps) {
if (op == rootOperation) continue;
setLoweringConfig(op, config);
}
}
return success();
}
} // namespace iree_compiler
} // namespace mlir