Set maximum number of threads in the thread block for CUDA target. The PTX programming model provides some performance-tuning directives; see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives. The downstream compiler, namely ptxas, leverages this information for better register allocation and for other resource-management decisions that improve performance. As far as I understand, IREE knows the number of threads in each thread block. This PR sets `maxntid` (emitted as the `maxntidx`, `maxntidy`, and `maxntidz` annotations, taken from the workgroup size).

commit: 214847af3d747f97518a7056323554c1cb1dfca2 [log] [tgz]
author: Guray Ozen <guray.ozen@gmail.com> Thu Dec 01 13:34:18 2022 +0100
committer: Guray Ozen <guray.ozen@gmail.com> Thu Dec 01 13:34:18 2022 +0100
tree: c6a024cb0faa0562f44c2eab298bd5ff4f1cb017
parent: 275280c2803f463b443b841eb78279dcac956ea6 [diff]
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index 17a8a9c..787393c 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp

@@ -277,6 +277,21 @@
           llvm::MDNode::get(llvmModule->getContext(), llvmMetadata);
       llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
           ->addOperand(llvmMetadataNode);
+      /* Set maximum number of threads in the thread block (CTA). */
+      auto generateMetadata = [&](int dim, StringRef name) {
+        llvm::Metadata *llvmMetadata[] = {
+            llvm::ValueAsMetadata::get(llvmFunc),
+            llvm::MDString::get(llvmModule->getContext(), name),
+            llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
+                llvm::Type::getInt32Ty(llvmModule->getContext()), dim))};
+        llvm::MDNode *llvmMetadataNode =
+            llvm::MDNode::get(llvmModule->getContext(), llvmMetadata);
+        llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
+            ->addOperand(llvmMetadataNode);
+      };
+      generateMetadata(workgroupSize[0], "maxntidx");
+      generateMetadata(workgroupSize[1], "maxntidy");
+      generateMetadata(workgroupSize[2], "maxntidz");
     }
 
     std::unique_ptr<llvm::TargetMachine> targetMachine;
commit	214847af3d747f97518a7056323554c1cb1dfca2	[log] [tgz]
author	Guray Ozen <guray.ozen@gmail.com>	Thu Dec 01 13:34:18 2022 +0100
committer	Guray Ozen <guray.ozen@gmail.com>	Thu Dec 01 13:34:18 2022 +0100
tree	c6a024cb0faa0562f44c2eab298bd5ff4f1cb017
parent	275280c2803f463b443b841eb78279dcac956ea6 [diff]