// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_
#define IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_

#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
namespace mlir::iree_compiler {
static constexpr int32_t kNumGPUDims = 3;
static constexpr int32_t kWarpSize = 32;
//===----------------------------------------------------------------------===//
// GPU processor IDs and sizes
//===----------------------------------------------------------------------===//
/// Returns thread IDs and counts, as linalg::ProcInfo, for the first `numDims`
/// dimensions of a workgroup with the given `workgroupSize`.
llvm::SmallVector<linalg::ProcInfo, 2>
getGPUThreadIdsAndCounts(OpBuilder &builder, Location loc, unsigned numDims,
                         llvm::ArrayRef<int64_t> workgroupSize);
/// Computes subgroup IDs and counts and returns them in (X, Y, Z) order.
///
/// Note that CUDA doesn't have a subgroup ID equivalent, so we compute the
/// subgroup ID from the thread ID. When tiling to warps we assume each warp
/// is full and pick a workgroup size such that `workgroupSize.x % warpSize ==
/// 0`. This is why we can have warpId = { threadId.x / warpSize, threadId.y,
/// threadId.z }.
llvm::SmallVector<linalg::ProcInfo, 2>
getSubgroupIdsAndCounts(OpBuilder &builder, Location loc, unsigned warpSize,
                        unsigned numDims, llvm::ArrayRef<int64_t> numSubgroups);
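// Worked example for getSubgroupIdsAndCounts (illustrative only, assuming
// full warps as described above): with workgroupSize = {128, 2, 1} and
// warpSize = 32, numSubgroups = {4, 2, 1}, and the thread with
// threadId = (70, 1, 0) belongs to subgroup (70 / 32, 1, 0) = (2, 1, 0).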
//===----------------------------------------------------------------------===//
// GPU vectorization
//===----------------------------------------------------------------------===//
/// Returns true if we can use all threads to perform vectorized load/store of
/// the given `shape`.
bool canPerformVectorAccessUsingAllThreads(ArrayRef<int64_t> shape,
                                           int64_t threadCount,
                                           int64_t vectorSize);
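// Illustrative example for canPerformVectorAccessUsingAllThreads (a sketch of
// the intent, not the exact predicate): for shape = {128, 32},
// threadCount = 256, and vectorSize = 4 there are 128 * 32 / 4 = 1024 vector
// accesses, a multiple of 256, so every thread can be given an equal share; a
// shape like {3, 5} cannot be distributed evenly this way and is expected to
// be rejected.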
/// Picks an unrolling order that allows tensor core operations to reuse the
/// LHS register. This is needed to get good performance on sm_80 targets.
std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract);
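// Illustrative note for gpuMmaUnrollOrder (an assumption about the common
// matmul-shaped case): for iterator dimensions (m, n, k), making n the
// innermost unrolled dimension lets the (m, k) LHS fragment stay in registers
// while successive n tiles are computed, which is the reuse this ordering
// targets.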
//===----------------------------------------------------------------------===//
// GPU workgroup memory
//===----------------------------------------------------------------------===//
/// Allocates GPU workgroup memory matching the given `subview`. If there are
/// dynamic dimensions, the bounds are in `sizeBounds`.
std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
                                             memref::SubViewOp subview,
                                             ArrayRef<Value> sizeBounds,
                                             DataLayout &);
/// Deallocates GPU workgroup memory behind `buffer`.
LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value buffer);
/// Copies `src` value to `dst` in shared memory.
LogicalResult copyToWorkgroupMemory(OpBuilder &builder, Value src, Value dst);
/// Propagates shared memory copy to producer linalg.fill or consumer
/// linalg.generic when possible.
void propagateSharedMemoryCopy(mlir::FunctionOpInterface funcOp);
/// Inserts barriers before and after shared memory copy.
void insertBarriersAroundSharedMemoryCopy(mlir::FunctionOpInterface funcOp);
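// Conceptually, insertBarriersAroundSharedMemoryCopy produces (sketch):
//   gpu.barrier
//   <copy to/from workgroup memory>
//   gpu.barrier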
/// Emits a reduction across a group for the given input. A `gpu.shuffle`
/// based reduction is emitted only when `expandSubgroupReduce` is set.
Value emitGPUGroupReduction(Location loc, OpBuilder &builder, Value input,
                            vector::CombiningKind kind, uint32_t size,
                            int warpSize, bool expandSubgroupReduce);
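// Sketch of the shuffle-based pattern behind emitGPUGroupReduction
// (illustrative, not the exact IR): a subgroup-wide sum over a 32-wide warp
// can be built from log2(32) = 5 butterfly steps, e.g.
//   %shfl, %valid = gpu.shuffle xor %x, %c16, %c32 : f32
//   %sum = arith.addf %x, %shfl : f32
// repeated with offsets 8, 4, 2, and 1.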
/// Returns the native size of an operation used in contraction calculations.
// TODO: Make this take HW specific sizes.
std::optional<SmallVector<int64_t>> getWmmaNativeVectorSize(Operation *op);
/// Helper function to return native size for MMA.SYNC-based operations.
std::optional<SmallVector<int64_t>> getMmaNativeVectorSize(Operation *op);
/// Returns true if the given memref type has the workgroup memory address
/// space.
bool hasSharedMemoryAddressSpace(MemRefType memrefType);
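// Example for hasSharedMemoryAddressSpace (illustrative): a type such as
//   memref<16x16xf32, #gpu.address_space<workgroup>>
// is expected to qualify as workgroup (shared) memory.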
/// Packs a lower-precision vector into a single 32-bit element
/// (e.g., <2xf16> -> i32 and <4xi8> -> i32).
Value packVectorToSupportedWidth(Location loc, OpBuilder &builder, Value input);
/// Unpacks a single scalar element into the target vector type
/// (e.g., i32 -> vector<4xi8> or f32 -> vector<2xf16>).
Value unpackToVector(Location loc, OpBuilder &builder, Value packedInput,
                     VectorType targetVecType);
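// Illustrative round trip through the two helpers above (a sketch; the exact
// ops emitted may differ):
//   packVectorToSupportedWidth : vector<2xf16> value -> i32
//   unpackToVector             : i32 -> vector<2xf16>
// which is conceptually a vector.bitcast between vector<2xf16> and
// vector<1xi32> plus an element extract/insert.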
//===----------------------------------------------------------------------===//
// GPU CodeGen op filter
//===----------------------------------------------------------------------===//
/// Returns true if the index map represents a transpose that benefits from
/// using shared memory when generating code for the GPU.
bool sharedMemTransposeFilter(AffineMap indexMap);
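// Example of a map this filter is intended to accept (illustrative): an
// indexing map such as
//   affine_map<(d0, d1) -> (d1, d0)>
// swaps the fastest-varying dimension, so the access runs along a
// non-contiguous dimension and staging through shared memory pays off.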
//===----------------------------------------------------------------------===//
// GPU UKernel Utils
//===----------------------------------------------------------------------===//
/// Returns true if the given ROCm target chip has ukernel support.
bool hasUkernelSupportedRocmArch(StringRef targetChip);
/// Returns true if `targetAttr`'s GPU architecture has ukernel support.
bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr);
//===----------------------------------------------------------------------===//
// GPU Target Information
//===----------------------------------------------------------------------===//
/// Returns the MMA types supported by the GPU target described by `config`,
/// if any are specified.
FailureOr<ArrayAttr> getSupportedMmaTypes(DictionaryAttr config);
/// Returns the MMA types supported by the GPU target of the given entry
/// point, if any are specified.
FailureOr<ArrayAttr> getSupportedMmaTypes(mlir::FunctionOpInterface entryPoint);
} // namespace mlir::iree_compiler

#endif // IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_