[cuda] Add command-line option to drop legacy sync mode (#15582)

Legacy sync mode forces an immediate wait on each semaphore, so it
effectively runs all async allocation/execution synchronously.
Now that we have proper semaphore emulation in cuda2, we no longer
need it. Disable it for the end-to-end op tests of the experimental
cuda2 driver.

Progress towards https://github.com/openxla/iree/issues/13245
diff --git a/compiler/plugins/target/CUDA/CUDATarget.cpp b/compiler/plugins/target/CUDA/CUDATarget.cpp
index 2e976bb..59e581c 100644
--- a/compiler/plugins/target/CUDA/CUDATarget.cpp
+++ b/compiler/plugins/target/CUDA/CUDATarget.cpp
@@ -62,6 +62,7 @@
   bool clUsePtxas = false;
   std::string clUsePtxasFrom;
   std::string clUsePtxasParams;
+  bool enableLegacySync = true;
 
   void bindOptions(OptionsBinder &binder) {
     static llvm::cl::OptionCategory category("CUDA HAL Target");
@@ -104,6 +105,12 @@
         "iree-hal-cuda-use-ptxas-params", clUsePtxasParams,
         llvm::cl::cat(category),
         llvm::cl::desc("Passes the given additional parameters to ptxas."));
+
+    binder.opt<bool>(
+        "iree-hal-cuda-enable-legacy-sync", enableLegacySync,
+        llvm::cl::cat(category),
+        llvm::cl::desc(
+            "Enable legacy sync mode that handles semaphores synchronously."));
   }
 };
 } // namespace
@@ -390,7 +397,9 @@
 
     // Indicates that the runtime HAL driver operates only in the legacy
     // synchronous mode.
-    configItems.emplace_back(b.getStringAttr("legacy_sync"), b.getUnitAttr());
+    if (options.enableLegacySync) {
+      configItems.emplace_back(b.getStringAttr("legacy_sync"), b.getUnitAttr());
+    }
 
     configItems.emplace_back(b.getStringAttr("executable_targets"),
                              getExecutableTargets(context));
diff --git a/experimental/cuda2/tests/stablehlo_ops/CMakeLists.txt b/experimental/cuda2/tests/stablehlo_ops/CMakeLists.txt
index 482f154..2b1772e 100644
--- a/experimental/cuda2/tests/stablehlo_ops/CMakeLists.txt
+++ b/experimental/cuda2/tests/stablehlo_ops/CMakeLists.txt
@@ -76,6 +76,7 @@
     "--iree-input-type=stablehlo"
     # TODO(#13984): We need memset emulation to workaround CUDA graph issues for now.
     "--iree-stream-emulate-memset"
+    "--iree-hal-cuda-enable-legacy-sync=false"
   RUNNER_ARGS
     "--cuda2_use_streams=false"
   LABELS
@@ -156,6 +157,7 @@
     "cuda2"
   COMPILER_FLAGS
     "--iree-input-type=stablehlo"
+    "--iree-hal-cuda-enable-legacy-sync=false"
   RUNNER_ARGS
     "--cuda2_use_streams=true"
   LABELS
diff --git a/experimental/cuda2/tests/tosa_ops/CMakeLists.txt b/experimental/cuda2/tests/tosa_ops/CMakeLists.txt
index 88752fe..e82be08 100644
--- a/experimental/cuda2/tests/tosa_ops/CMakeLists.txt
+++ b/experimental/cuda2/tests/tosa_ops/CMakeLists.txt
@@ -57,6 +57,7 @@
     "--iree-input-type=tosa"
     # TODO(#13984): We need memset emulation to workaround CUDA graph issues for now.
     "--iree-stream-emulate-memset"
+    "--iree-hal-cuda-enable-legacy-sync=false"
   RUNNER_ARGS
     "--cuda2_use_streams=false"
   LABELS
@@ -118,6 +119,7 @@
     "cuda2"
   COMPILER_FLAGS
     "--iree-input-type=tosa"
+    "--iree-hal-cuda-enable-legacy-sync=false"
   RUNNER_ARGS
     "--cuda2_use_streams=true"
   LABELS