Set the maximum number of threads per thread block for the CUDA target

The PTX programming model provides several performance-tuning directives; see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#performance-tuning-directives. The downstream compiler, ptxas, uses this information for better register allocation and for other resource-management decisions that improve performance.

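As far as I understand, IREE knows the number of threads in each thread block. This PR sets `maxntid` (emitted as the `maxntidx`, `maxntidy`, and `maxntidz` entries in `nvvm.annotations`) from the workgroup size.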
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index 17a8a9c..787393c 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -277,6 +277,21 @@
           llvm::MDNode::get(llvmModule->getContext(), llvmMetadata);
       llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
           ->addOperand(llvmMetadataNode);
+      /* Set maximum number of threads in the thread block (CTA). */
+      auto generateMetadata = [&](int dim, StringRef name) {
+        llvm::Metadata *llvmMetadata[] = {
+            llvm::ValueAsMetadata::get(llvmFunc),
+            llvm::MDString::get(llvmModule->getContext(), name),
+            llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
+                llvm::Type::getInt32Ty(llvmModule->getContext()), dim))};
+        llvm::MDNode *llvmMetadataNode =
+            llvm::MDNode::get(llvmModule->getContext(), llvmMetadata);
+        llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
+            ->addOperand(llvmMetadataNode);
+      };
+      generateMetadata(workgroupSize[0], "maxntidx");
+      generateMetadata(workgroupSize[1], "maxntidy");
+      generateMetadata(workgroupSize[2], "maxntidz");
     }
 
     std::unique_ptr<llvm::TargetMachine> targetMachine;