// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_
#define IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_

#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
namespace mlir::iree_compiler {
static constexpr int32_t kNumGPUDims = 3;
static constexpr int32_t kWarpSize = 32;
//===----------------------------------------------------------------------===//
// GPU processor IDs and sizes
//===----------------------------------------------------------------------===//
/// Returns thread IDs and counts, as linalg::ProcInfo, for the first `numDims`
/// dimensions of a workgroup with the given `workgroupSize`.
llvm::SmallVector<linalg::ProcInfo, 2>
getGPUThreadIdsAndCounts(OpBuilder &builder, Location loc, unsigned numDims,
                         llvm::ArrayRef<int64_t> workgroupSize);
/// Computes subgroup IDs and counts and returns them in (X, Y, Z) order.
///
/// Note that CUDA doesn't have a subgroup ID equivalent, so we compute the
/// subgroup ID from the thread ID. When tiling to warps we assume each warp
/// is full and pick a workgroup size such that `workgroupSize.x % warpSize ==
/// 0`. This is why we can have warpId = { threadId.x / warpSize, threadId.y,
/// threadId.z }.
llvm::SmallVector<linalg::ProcInfo, 2>
getSubgroupIdsAndCounts(OpBuilder &builder, Location loc, unsigned warpSize,
                        unsigned numDims, llvm::ArrayRef<int64_t> numSubgroups);
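// Worked example for getSubgroupIdsAndCounts (illustrative only, assuming
// full warps as described above): with workgroupSize = {128, 2, 1} and
// warpSize = 32, numSubgroups = {4, 2, 1}, and the thread with
// threadId = (70, 1, 0) belongs to subgroup (70 / 32, 1, 0) = (2, 1, 0).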
//===----------------------------------------------------------------------===//
// GPU vectorization
//===----------------------------------------------------------------------===//
/// Returns true if we can use all threads to perform vectorized load/store of
/// the given `shape`.
bool canPerformVectorAccessUsingAllThreads(ArrayRef<int64_t> shape,
                                           int64_t threadCount,
                                           int64_t vectorSize);
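// Illustrative example for canPerformVectorAccessUsingAllThreads (a sketch of
// the intent, not the exact predicate): for shape = {128, 32},
// threadCount = 256, and vectorSize = 4 there are 128 * 32 / 4 = 1024 vector
// accesses, a multiple of 256, so every thread can be given an equal share; a
// shape like {3, 5} cannot be distributed evenly this way and is expected to
// be rejected.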
/// Picks an unrolling order that allows tensor core operations to reuse the
/// LHS register. This is needed to get good performance on sm_80 targets.
std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract);
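// Illustrative note for gpuMmaUnrollOrder (an assumption about the common
// matmul-shaped case): for iterator dimensions (m, n, k), making n the
// innermost unrolled dimension lets the (m, k) LHS fragment stay in registers
// while successive n tiles are computed, which is the reuse this ordering
// targets.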
//===----------------------------------------------------------------------===//
// GPU workgroup memory
//===----------------------------------------------------------------------===//
/// Allocates GPU workgroup memory matching the given `subview`. If there are
/// dynamic dimensions, the bounds are in `sizeBounds`.
std::optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
                                             memref::SubViewOp subview,
                                             ArrayRef<Value> sizeBounds,
                                             DataLayout &);
/// Deallocates GPU workgroup memory behind `buffer`.
LogicalResult deallocateWorkgroupMemory(OpBuilder &, Value buffer);
/// Copies `src` value to `dst` in shared memory.
LogicalResult copyToWorkgroupMemory(OpBuilder &builder, Value src, Value dst);
/// Propagates shared memory copy to producer linalg.fill or consumer
/// linalg.generic when possible.
void propagateSharedMemoryCopy(mlir::FunctionOpInterface funcOp);
/// Inserts barriers before and after shared memory copy.
void insertBarriersAroundSharedMemoryCopy(mlir::FunctionOpInterface funcOp);
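// Conceptually, insertBarriersAroundSharedMemoryCopy produces (sketch):
//   gpu.barrier
//   <copy to/from workgroup memory>
//   gpu.barrier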
/// Emits a reduction across a group for the given input. A `gpu.shuffle`
/// based reduction is emitted only when `expandSubgroupReduce` is set.
Value emitGPUGroupReduction(Location loc, OpBuilder &builder, Value input,
                            vector::CombiningKind kind, uint32_t size,
                            int warpSize, bool expandSubgroupReduce);
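// Sketch of the shuffle-based pattern behind emitGPUGroupReduction
// (illustrative, not the exact IR): a subgroup-wide sum over a 32-wide warp
// can be built from log2(32) = 5 butterfly steps, e.g.
//   %shfl, %valid = gpu.shuffle xor %x, %c16, %c32 : f32
//   %sum = arith.addf %x, %shfl : f32
// repeated with offsets 8, 4, 2, and 1.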
/// Returns the native size of an operation used in contraction calculations.
// TODO: Make this take HW specific sizes.
std::optional<SmallVector<int64_t>> getWmmaNativeVectorSize(Operation *op);
/// Helper function to return native size for MMA.SYNC-based operations.
std::optional<SmallVector<int64_t>> getMmaNativeVectorSize(Operation *op);
/// Returns true if the given memref type has the workgroup memory address
/// space.
bool hasSharedMemoryAddressSpace(MemRefType memrefType);
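// Example for hasSharedMemoryAddressSpace (illustrative): a type such as
//   memref<16x16xf32, #gpu.address_space<workgroup>>
// is expected to qualify as workgroup (shared) memory.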
/// Packs a lower-precision vector into a single 32-bit element
/// (e.g., <2xf16> -> i32 and <4xi8> -> i32).
Value packVectorToSupportedWidth(Location loc, OpBuilder &builder, Value input);
/// Unpacks a single scalar element into the target vector type
/// (e.g., i32 -> vector<4xi8> or f32 -> vector<2xf16>).
Value unpackToVector(Location loc, OpBuilder &builder, Value packedInput,
                     VectorType targetVecType);
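// Illustrative round trip through the two helpers above (a sketch; the exact
// ops emitted may differ):
//   packVectorToSupportedWidth : vector<2xf16> value -> i32
//   unpackToVector             : i32 -> vector<2xf16>
// which is conceptually a vector.bitcast between vector<2xf16> and
// vector<1xi32> plus an element extract/insert.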
//===----------------------------------------------------------------------===//
// GPU CodeGen op filter
//===----------------------------------------------------------------------===//
/// Returns true if the index map represents a transpose that benefits from
/// using shared memory when generating code for the GPU.
bool sharedMemTransposeFilter(AffineMap indexMap);
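// Example of a map this filter is intended to accept (illustrative): an
// indexing map such as
//   affine_map<(d0, d1) -> (d1, d0)>
// swaps the fastest-varying dimension, so the access runs along a
// non-contiguous dimension and staging through shared memory pays off.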
//===----------------------------------------------------------------------===//
// GPU UKernel Utils
//===----------------------------------------------------------------------===//
/// Returns true if the given ROCm target chip has ukernel support.
bool hasUkernelSupportedRocmArch(StringRef targetChip);
/// Returns true if `targetAttr`'s GPU architecture has ukernel support.
bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr);
//===----------------------------------------------------------------------===//
// GPU Target Information
//===----------------------------------------------------------------------===//
/// Returns the MMA types supported by the GPU target described by `config`,
/// if any are specified.
FailureOr<ArrayAttr> getSupportedMmaTypes(DictionaryAttr config);
/// Returns the MMA types supported by the GPU target of the given entry
/// point, if any are specified.
FailureOr<ArrayAttr> getSupportedMmaTypes(mlir::FunctionOpInterface entryPoint);
} // namespace mlir::iree_compiler

#endif // IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_