Set the maximum number of threads per thread block for the CUDA target
The PTX programming model provides several performance-tuning directives; see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives. The downstream compiler, namely ptxas, leverages this information for better register allocation and for other resource-management decisions that improve performance.
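As far as I understand, IREE already knows the number of threads in each thread block. This PR sets `maxntid` by emitting the `maxntidx`, `maxntidy`, and `maxntidz` entries in `nvvm.annotations` from the workgroup size.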
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index 17a8a9c..787393c 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -277,6 +277,21 @@
llvm::MDNode::get(llvmModule->getContext(), llvmMetadata);
llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
->addOperand(llvmMetadataNode);
+ /* Set maximum number of threads in the thread block (CTA). */
+ auto generateMetadata = [&](int dim, StringRef name) {
+ llvm::Metadata *llvmMetadata[] = {
+ llvm::ValueAsMetadata::get(llvmFunc),
+ llvm::MDString::get(llvmModule->getContext(), name),
+ llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
+ llvm::Type::getInt32Ty(llvmModule->getContext()), dim))};
+ llvm::MDNode *llvmMetadataNode =
+ llvm::MDNode::get(llvmModule->getContext(), llvmMetadata);
+ llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
+ ->addOperand(llvmMetadataNode);
+ };
+ generateMetadata(workgroupSize[0], "maxntidx");
+ generateMetadata(workgroupSize[1], "maxntidy");
+ generateMetadata(workgroupSize[2], "maxntidz");
}
std::unique_ptr<llvm::TargetMachine> targetMachine;