// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

| #include "iree/compiler/Codegen/SPIRV/KernelConfig.h" |
| |
| #include "iree/compiler/Codegen/SPIRV/Utils.h" |
| #include "iree/compiler/Codegen/Transforms/Transforms.h" |
| #include "iree/compiler/Codegen/Utils/MarkerUtils.h" |
| #include "iree/compiler/Codegen/Utils/Utils.h" |
| #include "iree/compiler/Dialect/HAL/IR/LoweringConfig.h" |
| #include "llvm/Support/Debug.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h" |
| #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" |
| |
#define DEBUG_TYPE "iree-spirv-kernel-config"

namespace mlir {
namespace iree_compiler {

//===----------------------------------------------------------------------===//
// Utilities
//===----------------------------------------------------------------------===//

/// Given `nprocs`, tries to distribute it evenly across 2 logical dimensions.
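/// For example, distributeProcs2D(64) returns (8, 8), while
/// distributeProcs2D(48) truncates sqrt(48) to 6, rounds it up to the next
/// power of two (8), and returns (8, 6). Note that the product of the two
/// counts may be less than `nprocs`.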
static std::tuple<int64_t, int64_t> distributeProcs2D(int64_t nprocs) {
  int64_t nprocs_x = std::max<int64_t>(
      1, static_cast<int64_t>(
             llvm::PowerOf2Ceil(static_cast<uint64_t>(std::sqrt(nprocs)))));
  return std::make_tuple(nprocs_x, nprocs / nprocs_x);
}

/// Returns the minimum of `shape` and `tileSize` if `shape` is static.
/// Returns `tileSize` otherwise.
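/// For example, getMinIfStaticShape(8, 16) returns 8, while
/// getMinIfStaticShape(ShapedType::kDynamicSize, 16) returns 16.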
static int64_t getMinIfStaticShape(int64_t shape, int64_t tileSize) {
  if (shape == ShapedType::kDynamicSize) return tileSize;
  return std::min(shape, tileSize);
}

/// Defines the workgroup count region on entry point ops for the
/// `SPIRVDistributeToGlobalID` pipeline.
// TODO(ravishankarm): Remove this when that pipeline is deprecated.
static LogicalResult setTranslationUsingDistributeToGlobalId(
    FuncOp funcOp, ArrayRef<int64_t> workgroupSize) {
  auto entryPointOp = getEntryPoint(funcOp);
  MLIRContext *context = entryPointOp.getContext();
  auto translationInfo = buildTranslationInfo(
      IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistributeToGlobalID,
      /*workloadPerWorkgroup=*/{}, context);
  setTranslationInfo(entryPointOp, translationInfo, workgroupSize);
  OpBuilder builder(context);
  int64_t workgroupSizeX = workgroupSize[0];
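  // Linearize the whole workload onto the X dimension: numWorkgroupsX =
  // ceildiv(workload[0] * workload[1] * workload[2], workgroupSizeX), with Y
  // and Z fixed to 1.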
  auto numWorkgroupsFn = [workgroupSizeX](OpBuilder &b, Location loc,
                                          std::array<Value, 3> workload) {
    AffineExpr e1, e2, e3;
    bindSymbols(b.getContext(), e1, e2, e3);
    AffineExpr expr = e1 * e2 * e3;
    expr = expr.ceilDiv(workgroupSizeX);
    Value numWorkgroupsX = linalg::applyMapToValues(
        b, loc, AffineMap::get(0, 3, expr), workload)[0];
    Value one = b.create<ConstantIndexOp>(loc, 1);
    return std::array<Value, 3>{numWorkgroupsX, one, one};
  };
  return defineWorkgroupCountRegion(builder, funcOp, numWorkgroupsFn);
}

namespace detail {
LogicalResult defineConvWorkgroupCountRegion(
    Operation *op, ArrayRef<int64_t> outputShape,
    ArrayRef<int64_t> workgroupTileSizes) {
  auto numWorkgroupsFn = [&](OpBuilder &b, Location loc, std::array<Value, 3>) {
    std::array<Value, 3> xyz;
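    // Map output dimension i to workgroup count dimension 2 - i, i.e.,
    // dimension 0 to z and dimension 2 to x. The division below assumes the
    // tile sizes evenly divide the static output shape.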
    for (unsigned i = 0; i < 3; ++i) {
      int64_t count = outputShape[i] / workgroupTileSizes[i];
      xyz[2 - i] = b.create<ConstantIndexOp>(loc, count);
    }
    return xyz;
  };
  OpBuilder builder(op->getContext());
  return defineWorkgroupCountRegion(builder, op->getParentOfType<FuncOp>(),
                                    numWorkgroupsFn);
}
}  // namespace detail

//===----------------------------------------------------------------------===//
// Matmul Default Configuration
//===----------------------------------------------------------------------===//

static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
                                 linalg::BatchMatmulOp op) {
  unsigned maxWorkgroupSize =
      limits.max_compute_workgroup_invocations().getInt();

  // This is just hard-wired for now to be minimally viable, but it can be
  // tuned better once we have better estimates of device characteristics.
  const int64_t numRowsPerThread = 1;
  const int64_t numColsPerThread = 1;
  const int64_t numBatchesPerThread = 1;
  const int64_t tileSizeK = 0;

  std::array<int64_t, 3> workgroupSize = {1, 1, 1};
  std::tie(workgroupSize[0], workgroupSize[1]) =
      distributeProcs2D(maxWorkgroupSize);
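  // For example, with maxWorkgroupSize = 128, distributeProcs2D yields
  // workgroupSize = {16, 8, 1}, so each workgroup computes a 1x8x16 output
  // tile and each invocation computes a single output element.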

  auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;

  TileSizesListType tileSizes;
  // Workgroup level.
  tileSizes.push_back({numBatchesPerThread, numRowsPerThread * workgroupSize[1],
                       numColsPerThread * workgroupSize[0], tileSizeK});
  // No tiling at the subgroup level since this target doesn't use subgroup ops
  // or shared memory.
  tileSizes.emplace_back();
  // Invocation level.
  tileSizes.push_back(
      {numBatchesPerThread, numRowsPerThread, numColsPerThread, 0});

  return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
                                               op, tileSizes, {}, pipeline,
                                               workgroupSize);
}

static LogicalResult setOpConfig(spirv::ResourceLimitsAttr limits,
                                 linalg::MatmulOp op) {
  unsigned maxWorkgroupSize =
      limits.max_compute_workgroup_invocations().getInt();

  std::array<int64_t, 3> workgroupSize = {1, 1, 1};
  std::tie(workgroupSize[0], workgroupSize[1]) =
      distributeProcs2D(maxWorkgroupSize);

  const int numRowsPerThread = 1;
  const int numColsPerThread = 1;
  int64_t tileSizeK = 0;

  ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
  ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);

  int64_t M = lhsShape[0];
  int64_t N = rhsShape[1];
  int64_t K = lhsShape[1];

  auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;

  TileSizesListType tileSizes;
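  // For example, with maxWorkgroupSize = 128 (so workgroupSize = {16, 8, 1})
  // and a static 64x64x64 matmul, the workgroup-level tile below becomes
  // {min(64, 8), min(64, 16), min(64, 0)} = {8, 16, 0}; a tile size of 0
  // leaves the K dimension untiled.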
  // Workgroup level.
  tileSizes.push_back(
      {getMinIfStaticShape(M, numRowsPerThread * workgroupSize[1]),
       getMinIfStaticShape(N, numColsPerThread * workgroupSize[0]),
       getMinIfStaticShape(K, tileSizeK)});
  // No tiling at the subgroup level since this target doesn't use subgroup ops
  // or shared memory.
  tileSizes.emplace_back();
  // Invocation level.
  tileSizes.push_back({1, 1, 0});

  return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
                                               op, tileSizes, {}, pipeline,
                                               workgroupSize);
}

//===----------------------------------------------------------------------===//
// Default Configuration
//===----------------------------------------------------------------------===//

static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits,
                                        Operation *op) {
  auto partitionedLoops = getPartitionedLoops(op);
  if (partitionedLoops.empty()) {
    auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
    std::array<int64_t, 3> workgroupSize = {1, 1, 1};
    auto funcOp = op->getParentOfType<FuncOp>();
    return setOpConfigAndEntryPointFnTranslation(funcOp, op, {}, {}, pipeline,
                                                 workgroupSize);
  }

  const int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue();
  int64_t numElementsPerWorkgroup = subgroupSize;
  int64_t numElementsPerThread = 1;
  auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute;

  // Returns true if the given `operand` has a 32-bit element type.
  auto has32BitElementType = [](Value operand) {
    auto shapedType = operand.getType().dyn_cast<ShapedType>();
    Type elementType =
        (shapedType ? shapedType.getElementType() : operand.getType());
    return elementType.isa<FloatType>() || elementType.isInteger(32);
  };

  if (auto linalgOp = dyn_cast<linalg::LinalgOp>(op)) {
    bool vectorize = false;
    auto outputShape = getUntiledResultShape(linalgOp, 0);

    if (!linalgOp.hasIndexSemantics() &&
        // Skip vectorization for non-minor-identity inputs as it generates
        // vector.transfer_read ops with permutation maps that we currently
        // cannot lower.
        // TODO: Remove this restriction once the lowering of the permutation
        // map is supported in core.
        llvm::all_of(linalgOp.getIndexingMaps(),
                     [](AffineMap &map) { return map.isMinorIdentity(); }) &&
        // TODO(thomasraoux): Lowering of integers other than i32 may require
        // emulation, which is currently not supported for vector operations.
        // Re-enable this once the bug is fixed on the SPIR-V lowering side.
        llvm::all_of(linalgOp->getOperands(), has32BitElementType) &&
        llvm::all_of(outputShape,
                     [](int64_t dim) { return !ShapedType::isDynamic(dim); })) {
      vectorize = true;
    }

    SmallVector<int64_t, 4> candidateTileSizes;
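    // Prefer 4 * subgroupSize elements per workgroup when vectorization is
    // feasible, so that each invocation handles a 4-element vector; otherwise
    // fall back to subgroupSize with one element per invocation.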
    if (vectorize) candidateTileSizes.push_back(4 * subgroupSize);
    candidateTileSizes.push_back(subgroupSize);

    for (int64_t size : candidateTileSizes) {
      if (outputShape.back() % size != 0) continue;
      numElementsPerWorkgroup = size;
      break;
    }

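    // Disable vectorization if no candidate tile size evenly divides the
    // innermost static dimension, or if the chosen size leaves each
    // invocation with just one element.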
    if (numElementsPerWorkgroup <= subgroupSize ||
        outputShape.back() % numElementsPerWorkgroup != 0) {
      vectorize = false;
    }

    if (vectorize) {
      numElementsPerThread = numElementsPerWorkgroup / subgroupSize;
      pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
    }
  }

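  // Use a 1-D workgroup: subgroupSize invocations along X.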
  std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};

  unsigned loopDepth = partitionedLoops.back() + 1;
  SmallVector<int64_t, 4> workgroupTileSize(loopDepth, 0);
  SmallVector<int64_t, 4> threadTileSize(loopDepth, 0);

  // Tile each partitioned loop with size 1 by default.
  for (int64_t loopIndex : partitionedLoops) {
    workgroupTileSize[loopIndex] = threadTileSize[loopIndex] = 1;
  }
  // Overwrite the configuration for the innermost dimension.
  workgroupTileSize.back() = numElementsPerWorkgroup;
  threadTileSize.back() = numElementsPerThread;

  TileSizesListType tileSizes;
  tileSizes.push_back(workgroupTileSize);
  tileSizes.emplace_back();  // Subgroup level.
  tileSizes.push_back(threadTileSize);

  return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
                                               op, tileSizes, {}, pipeline,
                                               workgroupSize);
}

/// Sets the CodeGen configuration as attributes on the given `rootOp` if it's
/// a known Linalg matmul/convolution op for which we have good configurations.
static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv,
                                      Operation *rootOp) {
  LogicalResult result = success();
  // First try to find a proper CodeGen configuration for the current
  // target architecture.
  switch (targetEnv.getVendorID()) {
    case spirv::Vendor::ARM:
      result = detail::setMaliCodeGenConfig(targetEnv, rootOp);
      break;
    case spirv::Vendor::NVIDIA:
      result = detail::setNVIDIACodeGenConfig(targetEnv, rootOp);
      break;
    case spirv::Vendor::Qualcomm:
      result = detail::setAdrenoCodeGenConfig(targetEnv, rootOp);
      break;
    default:
      break;
  }

  if (failed(result)) return result;
  // Check whether a configuration was actually found. If so, we are done.
  if (getLoweringConfig(rootOp)) return result;

  // Otherwise fall back to a default configuration.
  spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
  return TypeSwitch<Operation *, LogicalResult>(rootOp)
      .Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
          [limits](auto op) { return setOpConfig(limits, op); })
      .Case<linalg::Conv2DNhwcHwcfOp, linalg::DepthwiseConv2DNhwOp>(
          [limits](auto op) { return setDefaultOpConfig(limits, op); })
      .Default([](Operation *) { return success(); });
}

//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//

LogicalResult initSPIRVLaunchConfig(ModuleOp module) {
  llvm::StringMap<IREE::HAL::ExecutableEntryPointOp> entryPointOps =
      getAllEntryPoints(module);
  spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(module);
  if (!targetEnvAttr) {
    return module.emitOpError(
        "expected parent hal.executable.variant to have spv.target_env "
        "attribute");
  }
  spirv::TargetEnv targetEnv(targetEnvAttr);
  spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();

  for (auto funcOp : module.getOps<FuncOp>()) {
    auto entryPointOp = entryPointOps.lookup(funcOp.getName());
    if (!entryPointOp) continue;
    if (getTranslationInfo(entryPointOp)) continue;

    SmallVector<Operation *, 4> computeOps;
    SmallVector<Operation *, 4> tiledLoops;
    if (failed(getComputeOps(funcOp, computeOps, tiledLoops))) {
      return funcOp.emitOpError("failed to get compute ops");
    }

    int64_t subgroupSize = limits.subgroup_size().getValue().getSExtValue();

    // If the dispatch region does not contain tiled and distributed Linalg
    // ops, invoke the pipeline to distribute to global invocations.
    if (tiledLoops.empty() && llvm::none_of(computeOps, [](Operation *op) {
          return hasMarker(op, getWorkgroupMarker());
        })) {
      std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
      if (failed(
              setTranslationUsingDistributeToGlobalId(funcOp, workgroupSize))) {
        return funcOp.emitOpError(
            "failed to set translation info for distributing to global IDs");
      }
      continue;
    }

    Operation *rootOperation = nullptr;

    // Try to find a configuration for a matmul/convolution op and use that op
    // as the root.
    for (Operation *computeOp : computeOps) {
      if (failed(setSPIRVOpConfig(targetEnv, computeOp))) return failure();

      // Check if the op configuration was set.
      if (!getLoweringConfig(computeOp)) continue;

      if (rootOperation) {
        return computeOp->emitOpError(
            "unhandled multiple roots in dispatch region");
      }
      rootOperation = computeOp;
    }

    // If there is still no root op, set a default configuration on the last
    // compute op (e.g., a linalg.generic op).
    if (!rootOperation) {
      for (Operation *computeOp : reverse(computeOps)) {
        if (failed(setDefaultOpConfig(limits, computeOp))) return failure();

        // Check if the op configuration was set.
        if (!getLoweringConfig(computeOp)) {
          return computeOp->emitOpError(
              "without known roots, the last operation in the tiled loop body "
              "is expected to be set as root");
        }
        rootOperation = computeOp;
        break;
      }
    }

    if (!rootOperation) {
      // If the tiled loops are not empty then this could be a corner case of
      // tensor.insert_slice being tiled and distributed, which just shows up
      // as a `flow.dispatch.tensor.load` and a `flow.dispatch.tensor.store`
      // (or as a copy). For now just treat the tiled loops not being empty as
      // an indicator of that. We need a better way of propagating this
      // information from the flow dialect to hal.
      if (!tiledLoops.empty()) {
        std::array<int64_t, 3> workgroupSize = {subgroupSize, 1, 1};
        SmallVector<int64_t> workloadPerWorkgroup(tiledLoops.size(), 1);
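        // Cover subgroupSize * 4 workload items along the X dimension per
        // workgroup and a single item along each remaining dimension.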
        workloadPerWorkgroup.front() = subgroupSize * 4;
        setTranslationInfo(
            funcOp, IREE::HAL::DispatchLoweringPassPipeline::SPIRVDistribute,
            workgroupSize, workloadPerWorkgroup);
        continue;
      }
      return funcOp.emitError("contains no root Linalg operation");
    }

    // Propagate the `lowering.config` attribute to the other ops.
    // TODO(ravishankarm, antiagainst): This is a very specific use case (and
    // fragile). In general, this should not be needed. Things are already
    // tiled and distributed. The rest of the compilation must be structured
    // to either use `TileAndFuse` or have independent configurations that are
    // determined based on the op.
    IREE::HAL::LoweringConfig config = getLoweringConfig(rootOperation);
    for (auto op : computeOps) {
      if (op == rootOperation) continue;
      setLoweringConfig(op, config);
    }
  }
  return success();
}

}  // namespace iree_compiler
}  // namespace mlir