// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
#include <numeric>
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "llvm/Support/Debug.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"
using namespace mlir;
using namespace mlir::iree_compiler;
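
// Number of threads per warp on NVIDIA GPUs; used throughout to size
// workgroups for the CUDA backend.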
static constexpr unsigned cudaWarpSize = 32;
namespace {
struct TileWorkgroupSizePair {
// How many scalar elements each workgroup should handle along each dimension.
std::array<int64_t, 3> tileSize;
std::array<int64_t, 3> workgroupSize;
};
} // namespace

/// Return the supported combinations of tile size and workgroup size. They are
/// then used to pick the best configuration aligned with the shape dimensions.
static void getMatmulConfig(SmallVectorImpl<TileWorkgroupSizePair> &tileSizes) {
// Pick tile sizes so that M*K and K*N are divisible by wgSize * /*vecSize=*/4.
// This way the workgroup memory copies don't need to be masked. Once we support
// masked loads we can get performance out of more configurations.
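// The first matching entry wins in setContractConfig, so the list is
// effectively ordered by preference. For example, a 256x512 matmul picks
// {32, 128, 32} with a 32x8x1 workgroup since 256 % 32 == 0 and 512 % 128 == 0.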
tileSizes.push_back(TileWorkgroupSizePair({{32, 128, 32}, {32, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{128, 64, 8}, {16, 8, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{16, 256, 32}, {64, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{8, 128, 4}, {32, 1, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{16, 64, 4}, {16, 2, 1}}));
tileSizes.push_back(TileWorkgroupSizePair({{1, 128, 8}, {32, 1, 1}}));
}
static LogicalResult setContractConfig(FuncOp entryPoint, linalg::LinalgOp op) {
TileSizesListType tileSizes;
// Infer the MxN size of the matmul based on operands and indexing maps.
auto lhsShape = getUntiledShape(op.getInputOperand(0)->get());
auto rhsShape = getUntiledShape(op.getInputOperand(1)->get());
int64_t sizeM = ShapedType::kDynamicSize;
int64_t sizeN = ShapedType::kDynamicSize;
auto outputMap = op.getTiedIndexingMap(op.getOutputOperand(0));
for (unsigned i = 0; i < lhsShape.size(); i++) {
if (op.getTiedIndexingMap(op.getInputOperand(0)).getDimPosition(i) ==
outputMap.getDimPosition(outputMap.getNumResults() - 2)) {
sizeM = lhsShape[i];
break;
}
}
for (unsigned i = 0; i < rhsShape.size(); i++) {
if (op.getTiedIndexingMap(op.getInputOperand(1)).getDimPosition(i) ==
outputMap.getDimPosition(outputMap.getNumResults() - 1)) {
sizeN = rhsShape[i];
break;
}
}
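// For a plain linalg.matmul, whose indexing maps are (m, k), (k, n) -> (m, n),
// this recovers sizeM from lhsShape[0] and sizeN from rhsShape[1].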
// Default tile size and workgroup size.
int64_t tileX = 2;
int64_t tileY = 256;
int64_t tileK = 4;
SmallVector<int64_t, 3> workgroupSize = {2 * cudaWarpSize, 1, 1};
bool isStaticSize =
sizeM != ShapedType::kDynamicSize && sizeN != ShapedType::kDynamicSize;
if (isStaticSize) {
// Special case for very small matrices.
if (sizeM * sizeN <= cudaWarpSize) {
tileX = sizeN;
tileY = sizeM;
workgroupSize = {sizeM, sizeN, 1};
}
SmallVector<TileWorkgroupSizePair> tileSizeConfig;
// Query the best configuration.
getMatmulConfig(tileSizeConfig);
// Pick the first configuration where the original shape is aligned on the
// tile sizes.
for (TileWorkgroupSizePair &config : tileSizeConfig) {
if (sizeN % config.tileSize[1] == 0 && sizeM % config.tileSize[0] == 0) {
tileX = config.tileSize[0];
tileY = config.tileSize[1];
tileK = config.tileSize[2];
workgroupSize.assign(config.workgroupSize.begin(),
config.workgroupSize.end());
break;
}
}
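// If none of the entries divides the shape evenly, keep the tile and
// workgroup sizes chosen above.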
}
// Currently just a basic tile size to enable tiling and vectorization.
// TODO: pick a more efficient tile size and tile at subgroup level.
SmallVector<int64_t, 4> ts;
// Tile all the outer parallel dimensions with a size of 1 and the two
// innermost dimensions with the tileX/tileY sizes.
ts.append(op.getNumParallelLoops() - 2, 1);
ts.append({tileX, tileY});
// Tile all the reduction dimensions.
ts.append(op.getNumReductionLoops(), tileK);
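// For example, a batch matmul with loops (B, M, N, K) ends up with workgroup
// tile sizes {1, tileX, tileY, tileK}.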
tileSizes.push_back(ts); // Workgroup level.
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes, /*nativeVectorSizes=*/ArrayRef<int64_t>{},
IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt,
workgroupSize);
}
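
/// Set the configuration for linalg_ext.fft ops: distribute the partitioned
/// loops with a tile size of 1 and, when the stage is a constant, tile the
/// innermost dimension by 2^stage.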
static LogicalResult setFftConfig(FuncOp entryPoint, linalg_ext::FftOp op) {
auto partitionedLoops = getPartitionedLoops(op);
unsigned loopDepth = partitionedLoops.back() + 1;
SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 0);
SmallVector<int64_t, 3> workgroupSize = {cudaWarpSize, 1, 1};
// Tile along the partitioned loops with a size of 1.
for (int64_t loopIndex : partitionedLoops) {
workgroupTileSize[loopIndex] = 1;
}
auto rank = op.getOperandRank();
if (workgroupTileSize.size() >= rank && workgroupTileSize[rank - 1] != 0) {
APInt value;
if (matchPattern(op.getStage(), m_ConstantInt(&value))) {
workgroupTileSize[rank - 1] = 1 << value.getSExtValue();
} else {
op.emitError("non-constant stage might not work for fft op");
return failure();
}
}
TileSizesListType tileSizes = {workgroupTileSize};
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes, /*nativeVectorSizes=*/ArrayRef<int64_t>{},
IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUDistribute,
workgroupSize);
}
// Basic default properties for linalg ops that haven't been tuned.
static LogicalResult setRootDefaultConfig(FuncOp entryPoint, Operation *op) {
IREE::HAL::DispatchLoweringPassPipeline passPipeline =
IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUDistribute;
TileSizesListType tileSizes;
SmallVector<unsigned> partitionedLoops = getPartitionedLoops(op);
if (partitionedLoops.empty()) {
tileSizes.push_back({});
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes, /*nativeVectorSizes=*/ArrayRef<int64_t>{},
passPipeline, {1, 1, 1});
}
size_t numLoops = partitionedLoops.back() + 1;
std::array<int64_t, 3> workgroupSize = {cudaWarpSize, 1, 1};
unsigned vectorSize = 4;
SmallVector<int64_t, 4> workgroupTileSizes(numLoops, 1);
// Set the tile size to zero for all loops that are not partitioned.
llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(),
partitionedLoops.end());
for (auto depth : llvm::seq<int64_t>(0, numLoops)) {
if (!partitionedLoopsSet.count(depth)) {
workgroupTileSizes[depth] = 0;
}
}
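// For generic ops, fall back to a vector size of 1 unless every output is
// statically shaped, indexed by a projected permutation, and large enough to
// amortize the vector loads (at least cudaWarpSize * vectorSize * 64 = 8192
// elements).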
if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
for (auto outputOperand : enumerate(genericOp.getOutputOperands())) {
if (!genericOp.getTiedIndexingMap(outputOperand.value())
.isProjectedPermutation()) {
vectorSize = 1;
break;
}
ArrayRef<int64_t> shape = getUntiledResultShape(
cast<linalg::LinalgOp>(op), outputOperand.index());
if (llvm::any_of(shape, ShapedType::isDynamic)) {
vectorSize = 1;
break;
}
int64_t problemSize = std::accumulate(
shape.begin(), shape.end(), 1,
[](const int64_t &a, const int64_t &b) { return a * b; });
if ((problemSize / (cudaWarpSize * vectorSize)) < 64) {
vectorSize = 1;
break;
}
}
}
// Pick a vectorSize of 1 for ops that we know won't get vectorized.
// TODO(thomasraoux): This could be improved by checking if the linalg op
// would fail vectorization.
if (!isa<linalg::LinalgOp>(op)) vectorSize = 1;
// Set the innermost partitioned parallel loop to cudaWarpSize * vectorSize.
for (int64_t depth = numLoops; depth > 0; depth--) {
if (partitionedLoopsSet.count(depth - 1)) {
workgroupTileSizes[depth - 1] = cudaWarpSize * vectorSize;
break;
}
}
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
// Tile the reduction dimensions to 1. Using a larger tile size may allow
// better scheduling and could help when one of the inputs has a transpose.
// TODO(thomasraoux): improve the heuristic.
workgroupTileSizes.append(linalgOp.getNumReductionLoops(), 1);
}
tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level
return setOpConfigAndEntryPointFnTranslation(
entryPoint, op, tileSizes, /*nativeVectorSizes=*/ArrayRef<int64_t>{},
IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUVectorize, workgroupSize);
}
/// Propagate the configuration annotated in the incoming IR.
static LogicalResult setUserConfig(FuncOp entryPointFn, Operation *computeOp,
IREE::HAL::LoweringConfig config) {
IREE::HAL::DispatchLoweringPassPipeline passPipeline =
IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUVectorize;
if (auto setPassPipeline = getLoweringPassPipeline(config)) {
passPipeline = setPassPipeline.getValue();
}
SmallVector<int64_t, 4> workgroupSize;
if (auto workgroupSizeAttr = config.workgroupSize()) {
workgroupSize = llvm::to_vector<4>(
llvm::map_range(workgroupSizeAttr, [](Attribute intAttr) {
return intAttr.cast<IntegerAttr>().getInt();
}));
}
if (failed(setOpConfigAndEntryPointFnTranslation(
entryPointFn, computeOp, config, passPipeline, workgroupSize))) {
return failure();
}
// Reset the op configuration to drop the pass-pipeline and workgroup size
// info. The op does not carry that information anymore.
auto resetConfig = IREE::HAL::LoweringConfig::get(
config.tileSizes(), config.nativeVectorSize(),
/*passPipeline =*/nullptr,
/*workgroupSize =*/nullptr, computeOp->getContext());
setLoweringConfig(computeOp, resetConfig);
return success();
}
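
/// Set the configuration for the root operation. A user-provided lowering
/// config in the IR takes precedence; otherwise pick a heuristic based on the
/// op kind (contraction, fft, or the default).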
static LogicalResult setRootConfig(FuncOp entryPointFn, Operation *computeOp) {
if (IREE::HAL::LoweringConfig config = getLoweringConfig(computeOp)) {
// If the op already has a lowering config coming from the IR, use it and
// bypass the heuristic.
return setUserConfig(entryPointFn, computeOp, config);
}
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
if (linalg::isaContractionOpInterface(linalgOp) &&
linalgOp.getNumParallelLoops() >= 2) {
return setContractConfig(entryPointFn, linalgOp);
}
}
if (auto fftOp = dyn_cast<linalg_ext::FftOp>(computeOp)) {
return setFftConfig(entryPointFn, fftOp);
}
return setRootDefaultConfig(entryPointFn, computeOp);
}
namespace mlir {
namespace iree_compiler {
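
/// For each entry point, pick a root operation among the compute ops, derive
/// its lowering configuration and translation info, and propagate the
/// configuration to the remaining compute ops.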
LogicalResult initGPULaunchConfig(ModuleOp moduleOp) {
llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPointOps =
getAllEntryPoints(moduleOp);
for (auto funcOp : moduleOp.getOps<FuncOp>()) {
auto entryPointOp = entryPointOps.lookup(funcOp.getName());
if (!entryPointOp) continue;
if (getTranslationInfo(entryPointOp)) continue;
SmallVector<Operation *> computeOps;
SmallVector<TiledLoopInfo> tiledLoops;
if (failed(getComputeOps(funcOp, computeOps, tiledLoops))) {
return funcOp.emitOpError("failed to get compute ops");
}
if (computeOps.empty()) {
std::array<int64_t, 3> workgroupSize = {1, 1, 1};
SmallVector<int64_t> workloadPerWorkgroup;
if (!tiledLoops.empty()) {
// If the tiled loops are not empty then this could be a corner case of
// tensor.insert_slice being tiled and distributed, which just shows up as a
// `flow.dispatch.tensor.load` and a `flow.dispatch.tensor.store`. For now,
// treat non-empty tiled loops as an indicator of that. We need a better way
// to pass this information from the flow dialect to HAL.
workgroupSize[0] = cudaWarpSize;
workloadPerWorkgroup.resize(tiledLoops.size(), 1);
workloadPerWorkgroup.front() = cudaWarpSize * 4;
}
// TODO(ravishankarm): Maybe this should just return without setting
// anything. Without any compute ops, this shouldn't be using tile and
// distribute.
setTranslationInfo(
funcOp, IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUDistribute,
workgroupSize, workloadPerWorkgroup);
continue;
}
Operation *rootOperation = nullptr;
// Find the root operation. linalg.generic, linalg.fill and linalg.copy are
// not root operations if there are other compute operations present.
for (Operation *op : llvm::reverse(computeOps)) {
if (!isa<linalg::GenericOp, linalg::FillOp, linalg::CopyOp>(op)) {
rootOperation = op;
break;
}
}
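// If no other root was found, fall back to the last generic/fill/copy op.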
if (!rootOperation) {
for (Operation *op : llvm::reverse(computeOps)) {
if (isa<linalg::GenericOp, linalg::FillOp, linalg::CopyOp>(op)) {
rootOperation = op;
break;
}
}
}
if (!rootOperation) {
// TODO(ravishankarm): Maybe this should just return without setting
// anything. Without a root operation, this shouldn't be using tile and
// distribute.
setTranslationInfo(
funcOp, IREE::HAL::DispatchLoweringPassPipeline::LLVMGPUDistribute,
{1, 1, 1}, /*workloadPerWorkgroup=*/{});
continue;
}
if (failed(setRootConfig(funcOp, rootOperation))) continue;
// Propagate the configuration to the other ops.
// TODO(ravishankarm, thomasraoux): This is a very specific use (and
// fragile). In general, this should not be needed. Things are already tiled
// and distributed; the rest of the compilation must either use `TileAndFuse`
// or rely on independent configurations that are determined per op.
IREE::HAL::LoweringConfig config = getLoweringConfig(rootOperation);
for (auto op : computeOps) {
if (op == rootOperation) continue;
setLoweringConfig(op, config);
}
}
return success();
}
} // namespace iree_compiler
} // namespace mlir