| // Copyright 2020 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h" |
| |
| #include "iree/compiler/Codegen/Transforms/Transforms.h" |
| #include "iree/compiler/Codegen/Utils/MarkerUtils.h" |
| #include "iree/compiler/Codegen/Utils/Utils.h" |
| #include "iree/compiler/Dialect/Flow/IR/FlowOps.h" |
| #include "iree/compiler/Dialect/HAL/IR/LoweringConfig.h" |
| #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" |
| #include "llvm/ADT/TypeSwitch.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| #include "mlir/Dialect/StandardOps/IR/Ops.h" |
| #include "mlir/IR/Matchers.h" |
| #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
| |
| namespace mlir { |
| namespace iree_compiler { |
| |
| // TODO(ravishankarm): This needs to be put in a common place for the CPU and |
| // GPU backends to use. |
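// Passed as a comma-separated list of sizes, e.g.
// `--iree-llvm-tile-size=64,64,32`.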
| static llvm::cl::list<unsigned> clLLVMTileSizes( |
| "iree-llvm-tile-size", |
| llvm::cl::desc("Set tile sizes to use for tiling Linalg operations in " |
| "LLVM code generation"), |
| llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated); |
| |
| static llvm::cl::opt<int> matmulWorkgroupTileSize( |
| "iree-codegen-llvm-matmul-workgroup-size", |
| llvm::cl::desc( |
| "linalg.matmul tile size for workgroups spliting of M, N dimension"), |
| llvm::cl::init(64)); |
| static llvm::cl::opt<int> matmulL1TileSize( |
| "iree-codegen-llvm-matmul-l1-size", |
| llvm::cl::desc( |
| "linalg.matmul tile size for L1 spliting of M, N, K dimension"), |
| llvm::cl::init(32)); |
| static llvm::cl::opt<int> matmulVectorSize( |
| "iree-codegen-llvm-matmul-vector-size", |
| llvm::cl::desc("linalg.matmul vector tile size"), llvm::cl::init(4)); |
| |
| static llvm::cl::opt<int> batchMatmulWorkgroupTileSize( |
| "iree-codegen-llvm-batch-matmul-workgroup-size", |
| llvm::cl::desc("linalg.batch_matmul tile size for workgroups spliting of " |
| "M, N dimension"), |
| llvm::cl::init(32)); |
| static llvm::cl::opt<int> batchMatmulL1TileSize( |
| "iree-codegen-llvm-batch-matmul-l1-size", |
| llvm::cl::desc("linalg.batch_matmul tile size for L1 spliting of M, N, K " |
| "dimensions"), |
| llvm::cl::init(16)); |
| |
| static llvm::cl::list<int> mmt4dWorkgroupTileSizes( |
| "iree-codegen-llvm-mmt4d-workgroup-tile-sizes", |
| llvm::cl::desc("linalg.mmt4d workgroup tile size"), llvm::cl::ZeroOrMore, |
| llvm::cl::MiscFlags::CommaSeparated); |
| |
| static llvm::cl::list<int> mmt4dL1TileSizes( |
| "iree-codegen-llvm-mmt4d-l1-tile-size", |
| llvm::cl::desc("linalg.mmt4d L1 tile size"), llvm::cl::ZeroOrMore, |
| llvm::cl::MiscFlags::CommaSeparated); |
| |
| static llvm::cl::list<int> mmt4dVectorSizes( |
| "iree-codegen-llvm-mmt4d-vector-size", |
| llvm::cl::desc("linalg.mmt4d vector tile size"), llvm::cl::ZeroOrMore, |
| llvm::cl::MiscFlags::CommaSeparated); |
| |
| static llvm::cl::opt<int> defaultWorkgroupTileSize( |
| "iree-codegen-llvm-generic-ops-workgroup-size", |
| llvm::cl::desc( |
| "linalg.generic and linalg.indexed_generic workgroup tile size"), |
| llvm::cl::init(64)); |
| |
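/// Returns the number of `elementType` elements that fit in one native
/// vector, derived from the `native_vector_size` (in bytes) entry of the
/// executable target configuration. Falls back to the
/// `iree-codegen-llvm-matmul-vector-size` flag when the attribute is absent
/// or the element type has no byte-aligned width.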
| static Optional<int64_t> getNativeVectorSize(FuncOp entryPointFn, |
| Type elementType) { |
| Optional<int64_t> nativeVectorSizeInBytes = llvm::None; |
| if (auto variantOp = |
| entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>()) { |
| if (IREE::HAL::ExecutableTargetAttr targetAttr = variantOp.target()) { |
| if (auto config = targetAttr.getConfiguration()) { |
| if (auto nativeVectorSizeAttr = |
| config.getAs<IntegerAttr>("native_vector_size")) { |
| if (int64_t nativeVectorSizeVal = nativeVectorSizeAttr.getInt()) { |
| nativeVectorSizeInBytes = nativeVectorSizeVal; |
| } |
| } |
| } |
| } |
| } |
  // TODO(ravishankarm): For now still pick the value from the
  // `iree-codegen-llvm-matmul-vector-size` option when the attribute is
  // missing, to avoid some issues on the RISC-V 32-bit side.
  if (nativeVectorSizeInBytes) {
    if (elementType.isIntOrFloat() &&
        elementType.getIntOrFloatBitWidth() >= 8) {
      unsigned byteWidth = elementType.getIntOrFloatBitWidth() / 8;
      return (*nativeVectorSizeInBytes) / byteWidth;
    }
  }
  return matmulVectorSize.getValue();
| } |
| |
/// Sets the lowering configuration for a dispatch region whose root op
/// implements the contraction operation interface.
| static LogicalResult setRootConfig( |
| FuncOp entryPointFn, linalg::ContractionOpInterface contractionOp) { |
| if (getLoweringConfig(contractionOp)) return success(); |
| Type elementType = |
| contractionOp.lhs().getType().cast<ShapedType>().getElementType(); |
| auto vectorSize = getNativeVectorSize(entryPointFn, elementType); |
| if (!vectorSize) return success(); |
| int64_t vectorSizeVal = *vectorSize; |
| |
| if (contractionOp.isRowMajorMatmul()) { |
| int mWorkgroupSize = matmulWorkgroupTileSize; |
| int nWorkgroupSize = matmulWorkgroupTileSize; |
| int mL1TileSize = matmulL1TileSize; |
| int nL1TileSize = matmulL1TileSize; |
| int kL1TileSize = matmulL1TileSize; |
| auto lhsShape = getUntiledShape(contractionOp.lhs()); |
| auto rhsShape = getUntiledShape(contractionOp.rhs()); |
| if (!lhsShape.empty() && !rhsShape.empty()) { |
      // Find the largest tile size that evenly divides the dimension and is
      // a multiple of the vector size, capped at `maxSize`.
| auto getTileSize = [vectorSizeVal](int dim, int maxSize) -> int { |
| if (dim == ShapedType::kDynamicSize) return maxSize; |
| if (dim < vectorSizeVal) return vectorSizeVal; |
| for (int i = std::min(maxSize, dim); i > 0; --i) { |
| if (dim % i == 0 && i % vectorSizeVal == 0) { |
| return i; |
| } |
| } |
| return maxSize; |
| }; |
| mWorkgroupSize = getTileSize(lhsShape[0], mWorkgroupSize); |
| nWorkgroupSize = getTileSize(rhsShape[1], nWorkgroupSize); |
| mL1TileSize = getTileSize(mWorkgroupSize, mL1TileSize); |
| nL1TileSize = getTileSize(nWorkgroupSize, nL1TileSize); |
| kL1TileSize = getTileSize(rhsShape[0], kL1TileSize); |
| } |
| TileSizesListType tileSizes = { |
| {mWorkgroupSize, nWorkgroupSize}, |
| {mL1TileSize, nL1TileSize, kL1TileSize}, |
| {vectorSizeVal, vectorSizeVal, vectorSizeVal}}; |
| SmallVector<int64_t, 4> nativeVectorSize = {vectorSizeVal, vectorSizeVal, |
| vectorSizeVal}; |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPointFn, contractionOp, tileSizes, nativeVectorSize, |
| IREE::HAL::DispatchLoweringPassPipeline::CPUTensorToVectors); |
| } |
| if (contractionOp.isRowMajorBatchMatmul()) { |
    // TODO(ataei, ravishankarm): This should just use the configuration for
    // matmul above, setting the tile size to 1 for all the batch dimensions.
| TileSizesListType tileSizes = { |
| {1, batchMatmulWorkgroupTileSize, batchMatmulWorkgroupTileSize}, |
| {1, batchMatmulL1TileSize, batchMatmulL1TileSize, |
| batchMatmulL1TileSize}, |
| {1, vectorSizeVal, vectorSizeVal, vectorSizeVal}}; |
| SmallVector<int64_t, 4> nativeVectorSize = {1, vectorSizeVal, vectorSizeVal, |
| vectorSizeVal}; |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPointFn, contractionOp, tileSizes, nativeVectorSize, |
| IREE::HAL::DispatchLoweringPassPipeline::CPUVectorization); |
| } |
| return success(); |
| } |
| |
/// Sets the lowering configuration for a dispatch region whose root op is a
/// linalg.mmt4d op.
| static LogicalResult setRootConfig(FuncOp entryPointFn, |
| linalg::Mmt4DOp mmt4dOp) { |
  // TODO(ataei): These are hand-tuned for some performance benchmarks for
  // now; we want to adopt the same strategy as matmul and dynamically set
  // the tile sizes.
| auto getWorkgroupTileSizes = [&]() -> SmallVector<int64_t> { |
| if (!mmt4dWorkgroupTileSizes.empty()) { |
| return SmallVector<int64_t>(mmt4dWorkgroupTileSizes.begin(), |
| mmt4dWorkgroupTileSizes.end()); |
| } |
| return {48, 32}; |
| }; |
| |
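  // By default the L1 and vector tile sizes follow the fixed inner tile
  // shape of the mmt4d operands: the LHS is {M1, K1, M0, K0} and the RHS is
  // {N1, K1, N0, K0}, so M0/N0/K0 are read off the inner operand dims.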
  auto getL1TileSizes = [&]() -> SmallVector<int64_t> {
    if (!mmt4dL1TileSizes.empty()) {
      return SmallVector<int64_t>(mmt4dL1TileSizes.begin(),
                                  mmt4dL1TileSizes.end());
    }
    auto lhsShape = getUntiledShape(mmt4dOp.inputs()[0]);
    auto rhsShape = getUntiledShape(mmt4dOp.inputs()[1]);
    int M0 = lhsShape[2];
    int N0 = rhsShape[2];
    int K0 = lhsShape[3];
    return {1, 1, 1, M0, N0, K0};
  };

  auto getVectorSizes = [&]() -> SmallVector<int64_t> {
    if (!mmt4dVectorSizes.empty()) {
      return SmallVector<int64_t>(mmt4dVectorSizes.begin(),
                                  mmt4dVectorSizes.end());
    }
    auto lhsShape = getUntiledShape(mmt4dOp.inputs()[0]);
    auto rhsShape = getUntiledShape(mmt4dOp.inputs()[1]);
    int M0 = lhsShape[2];
    int N0 = rhsShape[2];
    int K0 = lhsShape[3];
    return {1, 1, 1, M0, N0, K0};
  };
| |
| SmallVector<int64_t, 4> nativeVectorSize = getVectorSizes(); |
| |
| TileSizesListType tileSizes = {getWorkgroupTileSizes(), getL1TileSizes(), |
| nativeVectorSize}; |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPointFn, mmt4dOp, tileSizes, nativeVectorSize, |
| IREE::HAL::DispatchLoweringPassPipeline::CPUVectorization); |
| } |
| |
| /// Sets the lowering configuration for dispatch region for linalg_ext.fft root |
| /// op. |
| static LogicalResult setRootConfig(FuncOp entryPointFn, |
| linalg_ext::FftOp fftOp) { |
  auto partitionedLoops = getPartitionedLoops(fftOp);
  // Guard `back()`: the list of partitioned loops may be empty.
  unsigned maxDepth =
      partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1;
| SmallVector<int64_t, 4> workgroupTileSizes(maxDepth, |
| defaultWorkgroupTileSize); |
| llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(), |
| partitionedLoops.end()); |
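  // A tile size of 0 disables tiling for the loops that are not distributed.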
| for (auto dim : llvm::seq<int64_t>(0, workgroupTileSizes.size())) { |
| if (!partitionedLoopsSet.count(dim)) { |
| workgroupTileSizes[dim] = 0; |
| } |
| } |
| |
| auto rank = fftOp.getOperandRank(); |
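  // Each butterfly group at FFT stage `s` spans 2^s elements, so the
  // innermost tile size is set to a multiple of 2^stage. Taking the max with
  // the default workgroup tile size (a power of two by default) preserves
  // that property while keeping tiles reasonably large.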
| if (workgroupTileSizes.size() >= rank && workgroupTileSizes[rank - 1] != 0) { |
| APInt value; |
| if (matchPattern(fftOp.getStage(), m_ConstantInt(&value))) { |
| workgroupTileSizes[rank - 1] = 1 << value.getSExtValue(); |
| workgroupTileSizes[rank - 1] = |
| std::max(workgroupTileSizes[rank - 1], |
| static_cast<int64_t>(defaultWorkgroupTileSize)); |
    } else {
      return fftOp.emitError("non-constant fft stages are not yet supported");
    }
| } |
| TileSizesListType tileSizes = {workgroupTileSizes}; |
| |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPointFn, fftOp, tileSizes, /*nativeVectorSizes=*/ArrayRef<int64_t>{}, |
| IREE::HAL::DispatchLoweringPassPipeline::CPUDefault); |
| } |
| |
/// Sets a default lowering configuration that tiles only the partitioned
/// loops of the root op to the default workgroup tile size.
| static LogicalResult setDefaultRootConfig(FuncOp entryPointFn, Operation *op) { |
| auto partitionedLoops = getPartitionedLoops(op); |
| if (partitionedLoops.empty()) { |
| // Return success without doing anything. Eventually default will be used. |
| return success(); |
| } |
| unsigned maxDepth = partitionedLoops.back() + 1; |
| SmallVector<int64_t, 4> workgroupTileSizes(maxDepth, |
| defaultWorkgroupTileSize); |
| llvm::DenseSet<unsigned> partitionedLoopsSet(partitionedLoops.begin(), |
| partitionedLoops.end()); |
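  // A tile size of 0 disables tiling for the loops that are not distributed.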
| for (auto dim : llvm::seq<int64_t>(0, workgroupTileSizes.size())) { |
| if (!partitionedLoopsSet.count(dim)) { |
| workgroupTileSizes[dim] = 0; |
| } |
| } |
| TileSizesListType tileSizes = {workgroupTileSizes}; |
| return setOpConfigAndEntryPointFnTranslation( |
| entryPointFn, op, tileSizes, /*nativeVectorSizes=*/ArrayRef<int64_t>{}, |
| IREE::HAL::DispatchLoweringPassPipeline::CPUVectorization); |
| } |
| |
/// Finds the root operation in the given list of compute operations and sets
/// its lowering configuration. Fails if multiple roots are found.
| static LogicalResult setRootConfig(FuncOp entryPointFn, |
| ArrayRef<Operation *> computeOps) { |
| Operation *rootOp = nullptr; |
| for (auto computeOp : computeOps) { |
| if (!hasMarker(computeOp, getWorkgroupMarker())) continue; |
| |
    /// If the op already has a lowering config, check whether it also
    /// specifies a pass pipeline and workgroup size. If so, use those.
| if (auto config = getLoweringConfig(computeOp)) { |
| IREE::HAL::DispatchLoweringPassPipeline passPipeline = |
| IREE::HAL::DispatchLoweringPassPipeline::CPUDefault; |
| if (auto setPassPipeline = getLoweringPassPipeline(config)) { |
| passPipeline = setPassPipeline.getValue(); |
| } |
| SmallVector<int64_t, 4> workgroupSize; |
| if (auto workgroupSizeAttr = config.workgroupSize()) { |
| workgroupSize = llvm::to_vector<4>( |
| llvm::map_range(workgroupSizeAttr, [](Attribute intAttr) { |
| return intAttr.cast<IntegerAttr>().getInt(); |
| })); |
| } |
| if (failed(setOpConfigAndEntryPointFnTranslation( |
| entryPointFn, computeOp, config, passPipeline, workgroupSize))) { |
| return failure(); |
| } |
| // Reset the op configuration to drop the pass-pipeline and workgroup size |
| // info. The op does not carry that information anymore. |
| auto resetConfig = IREE::HAL::LoweringConfig::get( |
| config.tileSizes(), config.nativeVectorSize(), |
| /*passPipeline =*/nullptr, |
| /*workgroupSize =*/nullptr, computeOp->getContext()); |
| setLoweringConfig(computeOp, resetConfig); |
| } else { |
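      // Otherwise compute a configuration for the known root op kinds; ops
      // without special handling are left as-is and may receive the default
      // configuration below.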
| auto setRootConfigFn = [&](Operation *op) -> LogicalResult { |
| return TypeSwitch<Operation *, LogicalResult>(op) |
| .Case<linalg::Mmt4DOp, linalg::ContractionOpInterface, |
| linalg_ext::FftOp>( |
| [&](auto op) { return setRootConfig(entryPointFn, op); }) |
| .Default([&](Operation *op) { return success(); }); |
| }; |
| if (failed(setRootConfigFn(computeOp))) { |
| return failure(); |
| } |
| } |
| |
| if (getLoweringConfig(computeOp)) { |
| if (rootOp) { |
| return computeOp->emitError( |
| "unhandled multiple roots in dispatch region"); |
| } |
| rootOp = computeOp; |
| } |
| } |
| |
  // If no root operation was found, apply the default configuration to the
  // remaining compute ops, erroring out if that produces multiple roots.
| if (!rootOp) { |
| for (auto computeOp : computeOps) { |
| if (!hasMarker(computeOp, getWorkgroupMarker())) continue; |
| // Ignore fill ops. They never end up in their own dispatch, so are never |
| // root ops. |
| if (isa<linalg::FillOp>(computeOp)) continue; |
| if (failed(setDefaultRootConfig(entryPointFn, computeOp))) { |
| return failure(); |
| } |
| if (getLoweringConfig(computeOp)) { |
| if (rootOp) { |
| return computeOp->emitError( |
| "unhandled multiple roots in dispatch region"); |
| } |
| rootOp = computeOp; |
| } |
| } |
| } |
| return success(); |
| } |
| |
| LogicalResult initCPULaunchConfig(ModuleOp moduleOp) { |
| llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPointOps = |
| getAllEntryPoints(moduleOp); |
| for (auto funcOp : moduleOp.getOps<FuncOp>()) { |
| auto entryPointOp = entryPointOps.lookup(funcOp.getName()); |
| if (!entryPointOp) continue; |
| if (getTranslationInfo(entryPointOp)) continue; |
| SmallVector<Operation *, 4> computeOps; |
| SmallVector<Operation *, 4> tiledLoops; |
    // If there are no compute ops, skip the Linalg-based lowering.
| if (succeeded(getComputeOps(funcOp, computeOps, tiledLoops)) && |
| !computeOps.empty()) { |
| if (failed(setRootConfig(funcOp, computeOps))) { |
| return failure(); |
| } |
| } |
| |
    // If the entry point does not already have translation info set, just
    // add the default.
| SmallVector<int64_t> workloadPerWorkgroup; |
| if (!tiledLoops.empty()) { |
      // If the tiled loops are not empty then this could be a corner case of
      // a tensor.insert_slice being tiled and distributed that just shows up
      // as a `flow.dispatch.tensor.load` and a `flow.dispatch.tensor.store`.
      // For now treat non-empty tiled loops as an indicator of that. We need
      // a better way to propagate this information from the flow dialect to
      // HAL.
| workloadPerWorkgroup.resize(tiledLoops.size(), defaultWorkgroupTileSize); |
| } |
| if (!getTranslationInfo(entryPointOp)) { |
| setTranslationInfo(funcOp, |
| IREE::HAL::DispatchLoweringPassPipeline::CPUDefault, |
| /*workgroupSize =*/{}, workloadPerWorkgroup); |
| } |
| } |
| return success(); |
| } |
| |
| } // namespace iree_compiler |
| } // namespace mlir |