[cuda] Port over existing semaphore impl (#14325)

The current semaphore implementation actually does nothing.
Still port over it so that we can have a full implementation
passing various end-to-end tests to be based on. This makes
having a proper implementation of semaphores easier later
as we can verify correctness immediately afterwards.

Progress towards https://github.com/openxla/iree/issues/13245
diff --git a/experimental/cuda2/CMakeLists.txt b/experimental/cuda2/CMakeLists.txt
index b9be87d..58b3839 100644
--- a/experimental/cuda2/CMakeLists.txt
+++ b/experimental/cuda2/CMakeLists.txt
@@ -30,6 +30,8 @@
     "native_executable.h"
     "nop_executable_cache.c"
     "nop_executable_cache.h"
+    "nop_semaphore.c"
+    "nop_semaphore.h"
     "nccl_channel.c"
     "nccl_channel.h"
     "pipeline_layout.c"
@@ -44,6 +46,7 @@
     iree::base::internal::flatcc::parsing
     iree::hal
     iree::hal::utils::buffer_transfer
+    iree::hal::utils::semaphore_base
     iree::schemas::cuda_executable_def_c_fbs
   PUBLIC
 )
diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c
index fd37585..de78df6 100644
--- a/experimental/cuda2/cuda_device.c
+++ b/experimental/cuda2/cuda_device.c
@@ -18,6 +18,7 @@
 #include "experimental/cuda2/nccl_channel.h"
 #include "experimental/cuda2/nccl_dynamic_symbols.h"
 #include "experimental/cuda2/nop_executable_cache.h"
+#include "experimental/cuda2/nop_semaphore.h"
 #include "experimental/cuda2/pipeline_layout.h"
 #include "experimental/cuda2/tracing.h"
 #include "iree/base/internal/arena.h"
@@ -484,15 +485,16 @@
 static iree_status_t iree_hal_cuda2_device_create_semaphore(
     iree_hal_device_t* base_device, uint64_t initial_value,
     iree_hal_semaphore_t** out_semaphore) {
-  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                          "semaphore not yet implmeneted");
+  iree_hal_cuda2_device_t* device = iree_hal_cuda2_device_cast(base_device);
+  return iree_hal_cuda2_semaphore_create(initial_value, device->host_allocator,
+                                         out_semaphore);
 }
 
 static iree_hal_semaphore_compatibility_t
 iree_hal_cuda2_device_query_semaphore_compatibility(
     iree_hal_device_t* base_device, iree_hal_semaphore_t* semaphore) {
   // TODO: implement CUDA semaphores.
-  return IREE_HAL_SEMAPHORE_COMPATIBILITY_NONE;
+  return IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_ONLY;
 }
 
 // TODO: implement multiple streams; today we only have one and queue_affinity
diff --git a/experimental/cuda2/nop_semaphore.c b/experimental/cuda2/nop_semaphore.c
new file mode 100644
index 0000000..4d27772
--- /dev/null
+++ b/experimental/cuda2/nop_semaphore.c
@@ -0,0 +1,111 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "experimental/cuda2/nop_semaphore.h"
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/utils/semaphore_base.h"
+
+typedef struct iree_hal_cuda2_semaphore_t {
+  iree_hal_semaphore_t base;
+  iree_allocator_t host_allocator;
+  iree_atomic_int64_t value;
+} iree_hal_cuda2_semaphore_t;
+
+static const iree_hal_semaphore_vtable_t iree_hal_cuda2_semaphore_vtable;
+
+static iree_hal_cuda2_semaphore_t* iree_hal_cuda2_semaphore_cast(
+    iree_hal_semaphore_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_semaphore_vtable);
+  return (iree_hal_cuda2_semaphore_t*)base_value;
+}
+
+iree_status_t iree_hal_cuda2_semaphore_create(
+    uint64_t initial_value, iree_allocator_t host_allocator,
+    iree_hal_semaphore_t** out_semaphore) {
+  IREE_ASSERT_ARGUMENT(out_semaphore);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_cuda2_semaphore_t* semaphore = NULL;
+  iree_status_t status = iree_allocator_malloc(
+      host_allocator, sizeof(*semaphore), (void**)&semaphore);
+  if (iree_status_is_ok(status)) {
+    iree_hal_semaphore_initialize(&iree_hal_cuda2_semaphore_vtable,
+                                  &semaphore->base);
+    semaphore->host_allocator = host_allocator;
+    iree_atomic_store_int64(&semaphore->value, initial_value,
+                            iree_memory_order_release);
+    *out_semaphore = &semaphore->base;
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_cuda2_semaphore_destroy(
+    iree_hal_semaphore_t* base_semaphore) {
+  iree_hal_cuda2_semaphore_t* semaphore =
+      iree_hal_cuda2_semaphore_cast(base_semaphore);
+  iree_allocator_t host_allocator = semaphore->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_hal_semaphore_deinitialize(&semaphore->base);
+  iree_allocator_free(host_allocator, semaphore);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_status_t iree_hal_cuda2_semaphore_query(
+    iree_hal_semaphore_t* base_semaphore, uint64_t* out_value) {
+  iree_hal_cuda2_semaphore_t* semaphore =
+      iree_hal_cuda2_semaphore_cast(base_semaphore);
+  // TODO: Support semaphores completely.
+  *out_value =
+      iree_atomic_load_int64(&semaphore->value, iree_memory_order_acquire);
+  return iree_ok_status();
+}
+
+static iree_status_t iree_hal_cuda2_semaphore_signal(
+    iree_hal_semaphore_t* base_semaphore, uint64_t new_value) {
+  iree_hal_cuda2_semaphore_t* semaphore =
+      iree_hal_cuda2_semaphore_cast(base_semaphore);
+  // TODO: Support semaphores completely. Return OK currently as everything is
+  // synchronized for each submit to allow things to run.
+  iree_atomic_store_int64(&semaphore->value, new_value,
+                          iree_memory_order_release);
+  iree_hal_semaphore_poll(&semaphore->base);
+  return iree_ok_status();
+}
+
+static void iree_hal_cuda2_semaphore_fail(iree_hal_semaphore_t* base_semaphore,
+                                          iree_status_t status) {
+  iree_hal_cuda2_semaphore_t* semaphore =
+      iree_hal_cuda2_semaphore_cast(base_semaphore);
+  // TODO: save status and mark timepoint as failed.
+  iree_status_ignore(status);
+  iree_hal_semaphore_poll(&semaphore->base);
+}
+
+static iree_status_t iree_hal_cuda2_semaphore_wait(
+    iree_hal_semaphore_t* base_semaphore, uint64_t value,
+    iree_timeout_t timeout) {
+  iree_hal_cuda2_semaphore_t* semaphore =
+      iree_hal_cuda2_semaphore_cast(base_semaphore);
+  // TODO: Support semaphores completely. Return OK currently as everything is
+  // synchronized for each submit to allow things to run.
+  iree_hal_semaphore_poll(&semaphore->base);
+  return iree_ok_status();
+}
+
+static const iree_hal_semaphore_vtable_t iree_hal_cuda2_semaphore_vtable = {
+    .destroy = iree_hal_cuda2_semaphore_destroy,
+    .query = iree_hal_cuda2_semaphore_query,
+    .signal = iree_hal_cuda2_semaphore_signal,
+    .fail = iree_hal_cuda2_semaphore_fail,
+    .wait = iree_hal_cuda2_semaphore_wait,
+};
diff --git a/experimental/cuda2/nop_semaphore.h b/experimental/cuda2/nop_semaphore.h
new file mode 100644
index 0000000..f8038f2
--- /dev/null
+++ b/experimental/cuda2/nop_semaphore.h
@@ -0,0 +1,29 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef EXPERIMENTAL_CUDA2_NOP_SEMAPHORE_H_
+#define EXPERIMENTAL_CUDA2_NOP_SEMAPHORE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a HAL semaphore for CUDA that does not perform real synchronization.
+// This is expected to work with a command buffer that serializes all commands.
+iree_status_t iree_hal_cuda2_semaphore_create(
+    uint64_t initial_value, iree_allocator_t host_allocator,
+    iree_hal_semaphore_t** out_semaphore);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // EXPERIMENTAL_CUDA2_NOP_SEMAPHORE_H_