Move CUDA llvm optimization to the new pass manager (#11348)

Old pass manager pieces are starting to be removed.
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index e613599..69caaca 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -14,12 +14,16 @@
 #include "iree/compiler/Utils/FlatbufferUtils.h"
 #include "iree/compiler/Utils/StringUtils.h"
 #include "iree/schemas/cuda_executable_def_builder.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/StandardInstrumentations.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO.h"
@@ -50,6 +54,11 @@
     "iree-hal-cuda-llvm-target-arch", llvm::cl::desc("LLVM target chip."),
     llvm::cl::init("sm_35"));
 
+namespace llvm {
+class FunctionPass;
+FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
+FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
+}  // namespace llvm
 namespace mlir {
 namespace iree_compiler {
 namespace IREE {
@@ -124,25 +133,43 @@
                             llvm::TargetMachine &targetMachine) {
   if (requiresDeviceLib(module)) linkModule(module);
 
-  llvm::legacy::FunctionPassManager FPM(&module);
-  llvm::legacy::PassManager MPM;
-  llvm::PassManagerBuilder builder;
-  builder.OptLevel = 2;
-  builder.SizeLevel = 0;
-  builder.Inliner = llvm::createFunctionInliningPass();
-  builder.LoopVectorize = false;
+  // Workaround run those passed ahead as they are temporarily disabled in NVPTX
+  // target.
+  llvm::legacy::PassManager legacyPM;
+  legacyPM.add(llvm::createNVVMIntrRangePass(35));
+  legacyPM.add(llvm::createNVVMReflectPass(35));
+  legacyPM.run(module);
 
-  targetMachine.adjustPassManager(builder);
+  llvm::LoopAnalysisManager lam;
+  llvm::FunctionAnalysisManager fam;
+  llvm::CGSCCAnalysisManager cgam;
+  llvm::ModuleAnalysisManager mam;
 
-  builder.populateFunctionPassManager(FPM);
-  builder.populateModulePassManager(MPM);
+  fam.registerPass([&] { return targetMachine.getTargetIRAnalysis(); });
 
-  FPM.doInitialization();
-  for (llvm::Function &func : module) {
-    FPM.run(func);
-  }
-  FPM.doFinalization();
-  MPM.run(module);
+  llvm::PipelineTuningOptions pto;
+  pto.SLPVectorization = false;
+
+  llvm::PassInstrumentationCallbacks pic;
+
+  llvm::StandardInstrumentations si(false);
+  si.registerCallbacks(pic, &fam);
+
+  llvm::PassBuilder pb(&targetMachine, pto, llvm::None, &pic);
+  pb.registerModuleAnalyses(mam);
+  pb.registerCGSCCAnalyses(cgam);
+  pb.registerFunctionAnalyses(fam);
+  pb.registerLoopAnalyses(lam);
+  pb.crossRegisterProxies(lam, fam, cgam, mam);
+
+  llvm::OptimizationLevel ol = llvm::OptimizationLevel::O2;
+
+  llvm::ModulePassManager mpm;
+  mpm.addPass(llvm::VerifierPass());
+  mpm.addPass(pb.buildPerModuleDefaultPipeline(ol));
+  mpm.addPass(llvm::VerifierPass());
+
+  mpm.run(module, mam);
 }
 
 class CUDATargetBackend final : public TargetBackend {