[vulkan] Support emulated timeline semaphores
This commit adds an iree::hal::Semaphore implementation for Vulkan
using binary semaphores and fences. It supports the functionality
of a timeline semaphore while guaranteeing the usage requirements
of binary semaphores and fences. In order to do that, we also need
a special iree::hal::CommandQueue implementation for Vulkan that
is able to defer releasing submissions to the GPU.
Progress on https://github.com/google/iree/issues/2002.
Closes https://github.com/google/iree/pull/2208
PiperOrigin-RevId: 318074032
diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD
index 09130826..d4c5753 100644
--- a/iree/hal/vulkan/BUILD
+++ b/iree/hal/vulkan/BUILD
@@ -37,6 +37,8 @@
},
)
+# TODO(antiagainst): expose configuration for emulated timeline semaphore
+
cc_library(
name = "api",
srcs = ["api.cc"],
@@ -188,6 +190,25 @@
)
cc_library(
+ name = "emulated_timeline_semaphore",
+ srcs = ["emulated_timeline_semaphore.cc"],
+ hdrs = ["emulated_timeline_semaphore.h"],
+ deps = [
+ ":handle_util",
+ ":status_util",
+ ":timepoint_util",
+ "//iree/base:intrusive_list",
+ "//iree/base:status",
+ "//iree/base:tracing",
+ "//iree/hal:semaphore",
+ "@com_google_absl//absl/container:inlined_vector",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ "@iree_vulkan_headers//:vulkan_headers_no_prototypes",
+ ],
+)
+
+cc_library(
name = "extensibility_util",
srcs = ["extensibility_util.cc"],
hdrs = ["extensibility_util.h"],
@@ -327,6 +348,24 @@
)
cc_library(
+ name = "serializing_command_queue",
+ srcs = ["serializing_command_queue.cc"],
+ hdrs = ["serializing_command_queue.h"],
+ deps = [
+ ":direct_command_buffer",
+ ":emulated_timeline_semaphore",
+ ":handle_util",
+ ":status_util",
+ ":timepoint_util",
+ "//iree/base:status",
+ "//iree/base:tracing",
+ "//iree/hal:command_queue",
+ "@com_google_absl//absl/container:inlined_vector",
+ "@com_google_absl//absl/synchronization",
+ ],
+)
+
+cc_library(
name = "status_util",
srcs = ["status_util.cc"],
hdrs = ["status_util.h"],
@@ -337,6 +376,21 @@
)
cc_library(
+ name = "timepoint_util",
+ srcs = ["timepoint_util.cc"],
+ hdrs = ["timepoint_util.h"],
+ deps = [
+ ":handle_util",
+ "//iree/base:intrusive_list",
+ "//iree/base:ref_ptr",
+ "//iree/base:status",
+ "//iree/base:tracing",
+ "@com_google_absl//absl/synchronization",
+ "@iree_vulkan_headers//:vulkan_headers_no_prototypes",
+ ],
+)
+
+cc_library(
name = "vma_allocator",
srcs = [
"internal_vk_mem_alloc.cc",
@@ -385,6 +439,7 @@
":direct_command_buffer",
":direct_command_queue",
":dynamic_symbols",
+ ":emulated_timeline_semaphore",
":extensibility_util",
":handle_util",
":native_descriptor_set",
@@ -392,6 +447,7 @@
":native_timeline_semaphore",
":pipeline_cache",
":pipeline_executable_layout",
+ ":serializing_command_queue",
":status_util",
":vma_allocator",
"//iree/base:math",
diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt
index 0d2c52e..e534073 100644
--- a/iree/hal/vulkan/CMakeLists.txt
+++ b/iree/hal/vulkan/CMakeLists.txt
@@ -12,6 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# TODO(antiagainst): We should probably always compile the emulation in and
+# probe at runtime to enable it if the device does not support native timeline
+# semaphores.
+option(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE
+ "Emulates timeline semaphore with binary semaphores and fences" OFF)
+
+if(NOT IREE_HAL_DRIVER_VULKAN)
+ set(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE OFF CACHE BOOL "" FORCE)
+endif()
+
+if(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE)
+ set(IREE_EMULATE_TIMELINE_SEMAPHORE 1)
+else()
+ set(IREE_EMULATE_TIMELINE_SEMAPHORE 0)
+endif()
+
set(VMA_SRC_ROOT
"${IREE_ROOT_DIR}/third_party/vulkan_memory_allocator/src/"
)
@@ -199,6 +215,30 @@
iree_cc_library(
NAME
+ emulated_timeline_semaphore
+ HDRS
+ "emulated_timeline_semaphore.h"
+ SRCS
+ "emulated_timeline_semaphore.cc"
+ COPTS
+ "-DVK_NO_PROTOTYPES"
+ DEPS
+ ::handle_util
+ ::status_util
+ ::timepoint_util
+ absl::inlined_vector
+ absl::synchronization
+ absl::time
+ iree::base::intrusive_list
+ iree::base::status
+ iree::base::tracing
+ iree::hal::semaphore
+ Vulkan::Headers
+ PUBLIC
+)
+
+iree_cc_library(
+ NAME
extensibility_util
HDRS
"extensibility_util.h"
@@ -225,9 +265,11 @@
COPTS
"-DVK_NO_PROTOTYPES"
DEPS
+ absl::inlined_vector
absl::synchronization
absl::utility
iree::base::ref_ptr
+ iree::hal::command_queue
iree::hal::vulkan::dynamic_symbols
iree::hal::vulkan::extensibility_util
Vulkan::Headers
@@ -376,6 +418,30 @@
iree_cc_library(
NAME
+ serializing_command_queue
+ HDRS
+ "serializing_command_queue.h"
+ SRCS
+ "serializing_command_queue.cc"
+ COPTS
+ "-DVK_NO_PROTOTYPES"
+ DEPS
+ ::direct_command_buffer
+ ::emulated_timeline_semaphore
+ ::handle_util
+ ::status_util
+ ::timepoint_util
+ absl::inlined_vector
+ absl::synchronization
+ iree::base::status
+ iree::base::tracing
+ iree::hal::command_queue
+ Vulkan::Headers
+ PUBLIC
+)
+
+iree_cc_library(
+ NAME
status_util
HDRS
"status_util.h"
@@ -391,6 +457,26 @@
iree_cc_library(
NAME
+ timepoint_util
+ HDRS
+ "timepoint_util.h"
+ SRCS
+ "timepoint_util.cc"
+ COPTS
+ "-DVK_NO_PROTOTYPES"
+ DEPS
+ ::handle_util
+ absl::synchronization
+ iree::base::intrusive_list
+ iree::base::ref_ptr
+ iree::base::status
+ iree::base::tracing
+ Vulkan::Headers
+ PUBLIC
+)
+
+iree_cc_library(
+ NAME
vma_allocator
HDRS
"vma_allocator.h"
@@ -431,18 +517,21 @@
SRCS
"vulkan_device.cc"
COPTS
+ "-DIREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES=${IREE_EMULATE_TIMELINE_SEMAPHORE}"
"-DVK_NO_PROTOTYPES"
DEPS
::descriptor_pool_cache
::direct_command_buffer
::direct_command_queue
::dynamic_symbols
+ ::emulated_timeline_semaphore
::extensibility_util
::handle_util
::native_descriptor_set
::native_timeline_semaphore
::pipeline_cache
::pipeline_executable_layout
+ ::serializing_command_queue
::status_util
::vma_allocator
absl::inlined_vector
@@ -498,6 +587,7 @@
SRCS
"vulkan_driver_module.cc"
COPTS
+ "-DIREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES=${IREE_EMULATE_TIMELINE_SEMAPHORE}"
"-DVK_NO_PROTOTYPES"
DEPS
absl::flags
diff --git a/iree/hal/vulkan/emulated_timeline_semaphore.cc b/iree/hal/vulkan/emulated_timeline_semaphore.cc
new file mode 100644
index 0000000..9656310
--- /dev/null
+++ b/iree/hal/vulkan/emulated_timeline_semaphore.cc
@@ -0,0 +1,322 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
+
+#include "absl/container/inlined_vector.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "absl/utility/utility.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// static
+StatusOr<ref_ptr<Semaphore>> EmulatedTimelineSemaphore::Create(
+    ref_ptr<VkDeviceHandle> logical_device,
+    std::function<Status(Semaphore*)> on_signal,
+    std::function<void(Semaphore*)> on_failure,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Create");
+  // Construction itself cannot fail; the StatusOr return keeps this factory's
+  // signature uniform with other semaphore implementations' Create().
+  return make_ref<EmulatedTimelineSemaphore>(
+      std::move(logical_device), std::move(on_signal), std::move(on_failure),
+      std::move(semaphore_pool), initial_value);
+}
+
+EmulatedTimelineSemaphore::EmulatedTimelineSemaphore(
+    ref_ptr<VkDeviceHandle> logical_device,
+    std::function<Status(Semaphore*)> on_signal,
+    std::function<void(Semaphore*)> on_failure,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value)
+    // signaled_value_ is the timeline's current value; it only moves forward
+    // (or jumps to the UINT64_MAX failure sentinel in Fail()).
+    : signaled_value_(initial_value),
+      logical_device_(std::move(logical_device)),
+      on_signal_(std::move(on_signal)),
+      on_failure_(std::move(on_failure)),
+      semaphore_pool_(std::move(semaphore_pool)) {}
+
+EmulatedTimelineSemaphore::~EmulatedTimelineSemaphore() {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::dtor");
+  // Drain as much of the timeline as possible so any time points whose fences
+  // have resolved are recycled back into the pool before we check for
+  // stragglers. A failure here indicates the timeline is in a bad state.
+  CHECK_OK(TryToAdvanceTimeline(UINT64_MAX).status());
+  absl::MutexLock lock(&mutex_);
+  // Destroying with pending time points would leak pooled semaphores/fences
+  // that the GPU may still reference; callers must wait first.
+  CHECK(outstanding_semaphores_.empty())
+      << "Destroying an emulated timeline semaphore without first waiting on "
+         "outstanding signals";
+}
+
+StatusOr<uint64_t> EmulatedTimelineSemaphore::Query() {
+  // Advance the timeline as far as resolved fences allow before reporting.
+  RETURN_IF_ERROR(TryToAdvanceTimeline(UINT64_MAX).status());
+  uint64_t value = signaled_value_.load();
+  // UINT64_MAX is the sentinel for a failed timeline (see Fail()); surface
+  // the sticky failure status instead of the sentinel value.
+  if (value == UINT64_MAX) {
+    absl::MutexLock lock(&mutex_);
+    return status_;
+  }
+  return value;
+}
+
+Status EmulatedTimelineSemaphore::Signal(uint64_t value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Signal");
+  // Publish the new value first; exchange() returns the prior value so we can
+  // validate monotonicity below.
+  auto signaled_value = signaled_value_.exchange(value);
+  // Make sure the previous signaled value is smaller than the new value.
+  CHECK(signaled_value < value)
+      << "Attempting to signal a timeline value out of order; trying " << value
+      << " but " << signaled_value << " already signaled";
+
+  // Inform the device to make progress given we have a new value signaled now.
+  RETURN_IF_ERROR(on_signal_(this));
+
+  return OkStatus();
+}
+
+// Blocks until the timeline reaches |value| or |deadline| passes. The CPU
+// wait is emulated by waiting on the VkFence of the first time point that
+// will advance the timeline to at least |value|.
+Status EmulatedTimelineSemaphore::Wait(uint64_t value, absl::Time deadline) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait");
+
+  VkFence fence = VK_NULL_HANDLE;
+  do {
+    IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait#loop");
+    // First try to advance the timeline without blocking to see whether we've
+    // already reached the desired value.
+    ASSIGN_OR_RETURN(bool reached_desired_value, TryToAdvanceTimeline(value));
+    if (reached_desired_value) return OkStatus();
+
+    // We must wait now. Find the first emulated time point that has a value >=
+    // the desired value so we can wait on its associated signal fence to make
+    // sure the timeline is advanced to the desired value.
+    // NOTE: the lock is scoped to this iteration and is released before the
+    // vkWaitForFences call below.
+    absl::MutexLock lock(&mutex_);
+    auto semaphore = outstanding_semaphores_.begin();
+    for (; semaphore != outstanding_semaphores_.end(); ++semaphore) {
+      if ((*semaphore)->value >= value) break;
+    }
+    if (semaphore != outstanding_semaphores_.end()) {
+      if (!(*semaphore)->signal_fence) {
+        return InternalErrorBuilder(IREE_LOC)
+               << "Timeline should have a signal fence for the first time "
+                  "point beyond the signaled value";
+      }
+      fence = (*semaphore)->signal_fence->value();
+      // Found; we can break the loop and proceed to waiting now.
+      break;
+    }
+    // No time point covers |value| yet (wait-before-signal): spin until one
+    // appears or the deadline passes.
+    // TODO(antiagainst): figure out a better way instead of the busy loop here.
+  } while (absl::Now() < deadline);
+
+  if (fence == VK_NULL_HANDLE) {
+    return DeadlineExceededErrorBuilder(IREE_LOC)
+           << "Deadline reached when waiting timeline semaphore";
+  }
+
+  // Convert the absolute absl deadline into the relative nanosecond timeout
+  // that vkWaitForFences expects, clamping to [0, UINT64_MAX].
+  uint64_t timeout_nanos;
+  if (deadline == absl::InfiniteFuture()) {
+    timeout_nanos = UINT64_MAX;
+  } else if (deadline == absl::InfinitePast()) {
+    timeout_nanos = 0;
+  } else {
+    auto relative_nanos = absl::ToInt64Nanoseconds(deadline - absl::Now());
+    timeout_nanos = relative_nanos < 0 ? 0 : relative_nanos;
+  }
+
+  VK_RETURN_IF_ERROR(logical_device_->syms()->vkWaitForFences(
+      *logical_device_, /*fenceCount=*/1, &fence, /*waitAll=*/true,
+      timeout_nanos));
+
+  // The fence resolved; fold the progress back into the timeline state.
+  RETURN_IF_ERROR(TryToAdvanceTimeline(value).status());
+  return OkStatus();
+}
+
+void EmulatedTimelineSemaphore::Fail(Status status) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Fail");
+  absl::MutexLock lock(&mutex_);
+  // Record the sticky failure and park the timeline at the UINT64_MAX
+  // sentinel so Query() and waiters observe the failure.
+  status_ = std::move(status);
+  signaled_value_.store(UINT64_MAX);
+}
+
+// Returns a binary semaphore that, once signaled, guarantees the timeline has
+// advanced to at least |value|, claiming it for |wait_fence|'s submission.
+// Returns VK_NULL_HANDLE if no suitable unclaimed time point exists.
+VkSemaphore EmulatedTimelineSemaphore::GetWaitSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& wait_fence) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetWaitSemaphore");
+  absl::MutexLock lock(&mutex_);
+
+  VkSemaphore semaphore = VK_NULL_HANDLE;
+  // Find the first outstanding time point that advances the timeline to at
+  // least |value| and that has no waiter yet: binary semaphore signals and
+  // waits must pair 1:1, so a time point whose wait_fence is already set
+  // cannot be handed to a second waiter.
+  for (TimePointSemaphore* point : outstanding_semaphores_) {
+    if (point->value >= value && !point->wait_fence) {
+      point->wait_fence = add_ref(wait_fence);
+      semaphore = point->semaphore;
+      break;
+    }
+  }
+
+  return semaphore;
+}
+
+// Releases the waiter slot previously claimed on the time point emulated by
+// |semaphore| so that another submission may wait on it.
+Status EmulatedTimelineSemaphore::CancelWaitSemaphore(VkSemaphore semaphore) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::CancelWaitSemaphore");
+  absl::MutexLock lock(&mutex_);
+  for (TimePointSemaphore* time_point : outstanding_semaphores_) {
+    if (time_point->semaphore == semaphore) {
+      if (time_point->wait_fence) {
+        // Drop the claim; the time point becomes available to other waiters.
+        time_point->wait_fence = nullptr;
+        return OkStatus();
+      }
+      return InvalidArgumentErrorBuilder(IREE_LOC)
+             << "Time point wasn't waited before";
+    }
+  }
+  return InvalidArgumentErrorBuilder(IREE_LOC)
+         << "No time point for the given semaphore";
+}
+
+// Acquires a pooled binary semaphore to emulate the time point that signals
+// the timeline to |value|, recording it (sorted by value) in the outstanding
+// list. |signal_fence| is the fence of the submission that signals it.
+StatusOr<VkSemaphore> EmulatedTimelineSemaphore::GetSignalSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& signal_fence) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetSignalSemaphore");
+
+  // Signaled values must be monotonically increasing.
+  if (signaled_value_.load() >= value) {
+    return FailedPreconditionErrorBuilder(IREE_LOC)
+           << "Timeline semaphore already signaled past " << value;
+  }
+
+  absl::MutexLock lock(&mutex_);
+
+  // Keep outstanding_semaphores_ sorted by value: advance to the first time
+  // point with a larger value (or the end) as the insertion point. Note the
+  // increment here; without it this scan would never terminate.
+  auto insertion_point = outstanding_semaphores_.begin();
+  while (insertion_point != outstanding_semaphores_.end() &&
+         (*insertion_point)->value <= value) {
+    ++insertion_point;
+  }
+
+  ASSIGN_OR_RETURN(TimePointSemaphore * semaphore, semaphore_pool_->Acquire());
+  semaphore->value = value;
+  semaphore->signal_fence = add_ref(signal_fence);
+  if (semaphore->wait_fence) {
+    return InternalErrorBuilder(IREE_LOC)
+           << "Newly acquired time point semaphore should not have waiters";
+  }
+  outstanding_semaphores_.insert(insertion_point, semaphore);
+
+  return semaphore->semaphore;
+}
+
+// Walks the outstanding time points in value order, folding every fence that
+// has resolved into signaled_value_, recycling fully-resolved time points
+// back to the pool, and propagating the first fence error (if any) into
+// status_. Returns whether the timeline reached |to_upper_value|.
+StatusOr<bool> EmulatedTimelineSemaphore::TryToAdvanceTimeline(
+    uint64_t to_upper_value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::TryToAdvanceTimeline");
+
+  // We hold the lock during the entire resolve process so that we can resolve
+  // to the furthest possible value.
+  absl::MutexLock lock(&mutex_);
+
+  uint64_t past_value = signaled_value_.load();
+
+  // Fast path for when already signaled past the desired value.
+  if (past_value >= to_upper_value) return true;
+
+  // The timeline has not signaled past the desired value and there is no
+  // binary semaphore pending on GPU yet: certainly the timeline cannot
+  // advance to the desired value.
+  if (outstanding_semaphores_.empty()) return false;
+
+  IntrusiveList<TimePointSemaphore> resolved_semaphores;
+
+  bool keep_resolving = true;
+  bool reached_desired_value = false;
+  while (keep_resolving && !outstanding_semaphores_.empty()) {
+    auto* semaphore = outstanding_semaphores_.front();
+
+    // If the current semaphore is for a value beyond our upper limit, then
+    // early exit so that we don't spend time dealing with signals we don't yet
+    // care about. This can prevent live lock where one thread is signaling
+    // fences as fast/faster than another thread can consume them.
+    if (semaphore->value > to_upper_value) {
+      keep_resolving = false;
+      reached_desired_value = true;
+      break;
+    }
+
+    // If the current semaphore is for a value not greater than the past
+    // signaled value, then we know it was signaled previously. But there might
+    // be a waiter on it on GPU.
+    if (semaphore->value <= past_value) {
+      if (semaphore->signal_fence) {
+        return InternalErrorBuilder(IREE_LOC)
+               << "Timeline should already signaled past this time point and "
+                  "cleared the signal fence";
+      }
+
+      // If there are no waiters, we can recycle this semaphore now. If there
+      // exists one waiter, then query its status and recycle on success. We
+      // only handle success status here. Others will be handled when the fence
+      // is checked for other semaphores' signaling status for the same queue
+      // submission.
+      // NOTE(review): if the wait fence is present and not yet VK_SUCCESS the
+      // front element is not removed and this `continue` re-examines it —
+      // confirm this loop cannot spin indefinitely in that state.
+      if (!semaphore->wait_fence ||
+          semaphore->wait_fence->GetStatus() == VK_SUCCESS) {
+        semaphore->signal_fence = nullptr;
+        semaphore->wait_fence = nullptr;
+        outstanding_semaphores_.erase(semaphore);
+        resolved_semaphores.push_back(semaphore);
+      }
+
+      continue;
+    }
+
+    // This semaphore represents a value greater than the known previously
+    // signaled value. We don't know its status so we need to really query now.
+
+    if (!semaphore->signal_fence) {
+      return InternalErrorBuilder(IREE_LOC)
+             << "The status of this time point in the timeline should still be "
+                "pending with a singal fence";
+    }
+    VkResult signal_status = semaphore->signal_fence->GetStatus();
+
+    switch (signal_status) {
+      case VK_SUCCESS:
+        // The fence resolved: the timeline has advanced to this time point.
+        signaled_value_.store(semaphore->value);
+        semaphore->signal_fence = nullptr;
+        // If no waiters, we can recycle this semaphore now.
+        if (!semaphore->wait_fence) {
+          semaphore->signal_fence = nullptr;
+          semaphore->wait_fence = nullptr;
+          outstanding_semaphores_.erase(semaphore);
+          resolved_semaphores.push_back(semaphore);
+        }
+        break;
+      case VK_NOT_READY:
+        // The fence has not been signaled yet so this is the furthest time
+        // point we can go in this timeline.
+        keep_resolving = false;
+        break;
+      default:
+        // Fence indicates an error (device lost, out of memory, etc).
+        // Propagate this back to our status (and thus any waiters).
+        // Since we only take the first error we find we skip all remaining
+        // fences.
+        keep_resolving = false;
+        semaphore->signal_fence = nullptr;
+        status_ = VkResultToStatus(signal_status);
+        signaled_value_.store(UINT64_MAX);
+        break;
+    }
+  }
+
+  // Recycled resolved time points; on failure notify the device and release
+  // everything still outstanding.
+  semaphore_pool_->ReleaseResolved(&resolved_semaphores);
+  if (!status_.ok()) {
+    on_failure_(this);
+    semaphore_pool_->ReleaseUnresolved(&outstanding_semaphores_);
+    return status_;
+  }
+
+  return reached_desired_value;
+}
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
diff --git a/iree/hal/vulkan/emulated_timeline_semaphore.h b/iree/hal/vulkan/emulated_timeline_semaphore.h
new file mode 100644
index 0000000..cc13a09
--- /dev/null
+++ b/iree/hal/vulkan/emulated_timeline_semaphore.h
@@ -0,0 +1,223 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_EMULATED_TIMELINE_SEMAPHORE_H_
+#define IREE_HAL_VULKAN_EMULATED_TIMELINE_SEMAPHORE_H_
+
+#include <vulkan/vulkan.h>
+
+#include <atomic>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/semaphore.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A timeline semaphore emulated via `VkFence`s and binary `VkSemaphore`s.
+//
+// Vulkan provides several explicit synchronization primitives: fences,
+// (binary/timeline) semaphores, events, pipeline barriers, and render passes.
+// See "6. Synchronization and Cache Control" of the Vulkan specification
+// for the details.
+//
+// Render passes are for graphics pipelines so IREE does not care about them.
+// Pipeline barriers synchronize control within a command buffer at a single
+// point. Fences, (binary/timeline) semaphores, and events are synchronization
+// primitives that have separate signal and wait operations. Events are more
+// fine-grained compared to fences and semaphores given that they can be
+// signaled or waited within a command buffer while fences and semaphores are
+// at queue submissions. Each of them have its usage requirements:
+//
+// * Fences must be signaled on GPU and waited on CPU. Fences must be reset
+// before reuse.
+// * Binary semaphores must be signaled on GPU and waited on GPU. They do not
+// support wait-before-signal submission order. More importantly, binary
+// semaphore wait also unsignals the semaphore. So binary semaphore signals
+// and waits should occur in discrete 1:1 pairs.
+// * Timeline semaphores can be signaled on CPU or GPU and waited on CPU or GPU.
+// They support wait-before-signal submission order. Timeline semaphores do
+// not need to be reset.
+//
+// It's clear that timeline semaphore is more flexible than fences and binary
+// semaphores: it unifies GPU and CPU synchronization with a single primitive.
+// But it's not always available: it requires the VK_KHR_timeline_semaphore
+// or Vulkan 1.2. When it's not available, it can be emulated via `VkFence`s
+// and binary `VkSemaphore`s. The emulation needs to provide the functionality
+// timeline semaphores and also not violate the usage requirements of `VkFence`s
+// and binary `VkSemaphore`s.
+//
+// The basic idea is to create a timeline object with time points to emulate the
+// timeline semaphore, which consists of a monotonically increasing 64-bit
+// integer value. Each time point represents a specific signaled/waited integer
+// value of the timeline semaphore; each time point can associate with binary
+// `VkSemaphore`s and/or `VkFence`s for emulating the synchronization.
+//
+// Concretely, for each of the possible signal -> wait scenarios timeline
+// semaphore supports:
+//
+// ### GPU -> GPU (via `vkQueueSubmit`)
+//
+// Each `vkQueueSubmit` can attach a `VkTimelineSemaphoreSubmitInfo` to describe
+// the timeline semaphore values signaled and waited. Each of the signaled value
+// will be a time point and emulated by a binary `VkSemaphore`. We submit the
+// binary `VkSemaphore`s to the GPU under the hood. For the waited values, the
+// situation is more complicated because of the differences between binary and
+// timeline semaphores:
+//
+// * Binary semaphore signal-wait relationship is strictly 1:1, unlike timeline
+// semaphore where we can have 1:N cases. This means for a specific binary
+// `VkSemaphore` used to emulate a signaled time point, we can have at most
+// one subsequent `vkQueueSubmit` waits on it. We need other mechanisms for
+// additional waits. A simple way is to involve the CPU and not submit
+// the additional work to queue until the desired value is already signaled
+// past. This requires `VkFence`s for letting the CPU know the status of
+// GPU progress, but `VkFence` is needed anyway because of GPU -> CPU
+// synchronization.
+// * Binary semaphores do not support wait-before-signal submission order.
+// This means we need to put the submission into a self-managed queue if the
+// binary semaphores used to emulate the time points waited by the submission
+// are not submitted to GPU yet.
+//
+// ### GPU -> CPU (via `vkWaitSemaphores`)
+//
+// Without timeline semaphore, we need to use fences to let CPU wait on GPU
+// progress. So this direction can be emulated by `vkWaitFences`. It means we
+// need to associate a `VkFence` with the given waited timeline semaphores.
+// Because we don't know whether a particular `vkQueueSubmit` with timeline
+// semaphores will be later waited on by CPU beforehand, we need to bundle each
+// of them with a `VkFence` just in case they will be waited on later.
+//
+// ### CPU -> GPU (via `vkSignalSemaphore`)
+//
+// This direction can be handled by bumping the signaled timeline value and
+// scan the self-managed queue to submit more work to GPU if possible.
+//
+// ### CPU -> CPU (via `vkWaitSemaphores`)
+//
+// This is similar to CPU -> GPU direction; we just need to enable other threads
+// on CPU side and let them progress.
+//
+// The implementation is inspired by the Vulkan-ExtensionLayer project:
+// https://github.com/KhronosGroup/Vulkan-ExtensionLayer. We don't handle all
+// the aspects of the full spec though given that IREE only uses a subset of
+// synchronization primitives. So this should not be treated as a full
+// emulation of the Vulkan spec and thus does not substitute
+// Vulkan-ExtensionLayer.
+class EmulatedTimelineSemaphore final : public Semaphore {
+ public:
+ // Creates a timeline semaphore with the given |initial_value|.
+ static StatusOr<ref_ptr<Semaphore>> Create(
+ ref_ptr<VkDeviceHandle> logical_device,
+ std::function<Status(Semaphore*)> on_signal,
+ std::function<void(Semaphore*)> on_failure,
+ ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value);
+
+ EmulatedTimelineSemaphore(ref_ptr<VkDeviceHandle> logical_device,
+ std::function<Status(Semaphore*)> on_signal,
+ std::function<void(Semaphore*)> on_failure,
+ ref_ptr<TimePointSemaphorePool> semaphore_pool,
+                            uint64_t initial_value);
+
+ ~EmulatedTimelineSemaphore() override;
+
+ StatusOr<uint64_t> Query() override;
+
+ Status Signal(uint64_t value) override;
+
+ Status Wait(uint64_t value, absl::Time deadline) override;
+
+ void Fail(Status status) override;
+
+ // Gets a binary semaphore for waiting on the timeline to advance to the given
+// |value|. The semaphore returned won't be waited by anyone else. Returns
+// VK_NULL_HANDLE if no semaphore is available for the given |value|.
+// |wait_fence| is the fence associated with the queue submission that is
+// waiting on this semaphore.
+ VkSemaphore GetWaitSemaphore(uint64_t value,
+ const ref_ptr<TimePointFence>& wait_fence);
+
+ // Cancels the waiting attempt on the given binary |semaphore|. This allows
+ // the |semaphore| to be waited by others.
+ Status CancelWaitSemaphore(VkSemaphore semaphore);
+
+ // Gets a binary semaphore for signaling the timeline to the given |value|.
+// |value| must be larger than the current timeline value. |signal_fence| is
+ // the fence associated with the queue submission that signals this semaphore.
+ StatusOr<VkSemaphore> GetSignalSemaphore(
+ uint64_t value, const ref_ptr<TimePointFence>& signal_fence);
+
+ private:
+ // Tries to advance the timeline to the given |to_upper_value| without
+ // blocking and returns whether the |to_upper_value| is reached.
+ StatusOr<bool> TryToAdvanceTimeline(uint64_t to_upper_value)
+ ABSL_LOCKS_EXCLUDED(mutex_);
+
+ std::atomic<uint64_t> signaled_value_;
+
+ ref_ptr<VkDeviceHandle> logical_device_;
+
+ // Callback to inform that this timeline semaphore has signaled a new value.
+ std::function<Status(Semaphore*)> on_signal_;
+
+ // Callback to inform that this timeline semaphore has encountered a failure.
+ std::function<void(Semaphore*)> on_failure_;
+
+ ref_ptr<TimePointSemaphorePool> semaphore_pool_;
+
+ mutable absl::Mutex mutex_;
+
+ // A list of outstanding semaphores used to emulate time points.
+ //
+ // The life time of each semaphore is in one of the following state:
+ //
+ // * Unused state: value = UINT64_MAX, signal/wait fence = nullptr. This is
+ // the state of the semaphore when it's initially acquired from the pool and
+ // not put in the queue for emulating a time point yet.
+ // * Pending state: signaled value < value < UINT64_MAX, signal fence =
+ // <some-fence>, wait fence == nullptr. This is the state of the semaphore
+ // when it's put into the GPU queue for emulating a time point.
+ // * Pending and waiting state: signaled value < value < UINT64_MAX, signal
+ // fence = <some-fence>, wait fence == <some-fence>. This is the state of
+ // the semaphore when it's put into the GPU queue for emulating a time
+ // point and there is another queue submission waiting on it in GPU.
+// * Signaled and not ever waited state: value <= signaled value, signal/wait
+ // fence = nullptr. This is the state of the semaphore when we know it's
+ // already signaled on GPU and there is no waiters for it.
+ // * Signaled and waiting state: value <= signaled value, signal fence =
+ // nullptr, wait fence = <some-fence>. This is the state of the semaphore
+ // when we know it's already signaled on GPU and there is still one queue
+ // submission on GPU is waiting for it.
+ IntrusiveList<TimePointSemaphore> outstanding_semaphores_
+ ABSL_GUARDED_BY(mutex_);
+
+ // NOTE: We only need to access this status (and thus take the lock) when we
+ // want to either signal failure or query the status in the case of the
+ // semaphore being set to UINT64_MAX.
+ Status status_ ABSL_GUARDED_BY(mutex_);
+};
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
+
+#endif // IREE_HAL_VULKAN_EMULATED_TIMELINE_SEMAPHORE_H_
diff --git a/iree/hal/vulkan/serializing_command_queue.cc b/iree/hal/vulkan/serializing_command_queue.cc
new file mode 100644
index 0000000..9d6d24c
--- /dev/null
+++ b/iree/hal/vulkan/serializing_command_queue.cc
@@ -0,0 +1,355 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/serializing_command_queue.h"
+
+#include <memory>
+
+#include "absl/time/clock.h"
+#include "absl/types/span.h"
+#include "iree/base/memory.h"
+#include "iree/base/source_location.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/command_queue.h"
+#include "iree/hal/semaphore.h"
+#include "iree/hal/vulkan/direct_command_buffer.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Tries to prepare all necessary binary `VkSemaphore`s for emulating the time
+// points as specified in the given submission |batch| and returns true if
+// possible so that the |batch| is ready to be submitted to GPU.
+// |wait_semaphores| and |signal_semaphores| will be filled with the binary
+// `VkSemaphores` on success. |fence| is the fence associated with the
+// submission |batch|.
+StatusOr<bool> TryToPrepareSemaphores(
+ const SubmissionBatch& batch, const ref_ptr<TimePointFence>& fence,
+ absl::InlinedVector<VkSemaphore, 4>* wait_semaphores,
+ absl::InlinedVector<VkSemaphore, 4>* signal_semaphores) {
+ IREE_TRACE_SCOPE0("TryToPrepareSemaphores");
+
+ wait_semaphores->clear();
+ for (const auto& timeline_semaphore : batch.wait_semaphores) {
+ // Query first to progress this timeline semaphore to the furthest.
+ ASSIGN_OR_RETURN(auto signaled_value,
+ timeline_semaphore.semaphore->Query());
+
+ // If it's already signaled to a value greater than we require here,
+ // we can just ignore this semaphore now.
+ if (signaled_value >= timeline_semaphore.value) continue;
+
+ // SerializingCommandQueue only works with EmulatedTimelineSemaphore.
+ auto* emulated_semaphore =
+ static_cast<EmulatedTimelineSemaphore*>(timeline_semaphore.semaphore);
+
+ // Otherwise try to get a binary semaphore for this time point that we
+ // can wait on.
+ VkSemaphore binary_semaphore =
+ emulated_semaphore->GetWaitSemaphore(timeline_semaphore.value, fence);
+
+ if (binary_semaphore == VK_NULL_HANDLE) {
+ // We cannot wait on this time point yet: there are no previous semaphores
+ // submitted to the GPU that can signal a value greater than what's
+ // desired here.
+
+ // Cancel the wait so others may make progress. NOTE(review): this cancels via the current emulated_semaphore, but earlier entries in *wait_semaphores may belong to other timelines -- confirm intended.
+ for (VkSemaphore semaphore : *wait_semaphores) {
+ RETURN_IF_ERROR(emulated_semaphore->CancelWaitSemaphore(semaphore));
+ }
+
+ // This batch cannot be submitted to GPU yet.
+ return false;
+ }
+
+ wait_semaphores->push_back(binary_semaphore);
+ }
+
+ // We've collected all necessary binary semaphores for each timeline we need
+ // to wait on. Now prepare binary semaphores for signaling.
+ signal_semaphores->clear();
+ for (const auto& timeline_semaphore : batch.signal_semaphores) {
+ // SerializingCommandQueue only works with EmulatedTimelineSemaphore.
+ auto* emulated_semaphore =
+ static_cast<EmulatedTimelineSemaphore*>(timeline_semaphore.semaphore);
+
+ ASSIGN_OR_RETURN(auto binary_semaphore,
+ emulated_semaphore->GetSignalSemaphore(
+ timeline_semaphore.value, fence));
+ signal_semaphores->push_back(binary_semaphore);
+ }
+
+ // Good to submit!
+ return true;
+}
+
+// Prepares `VkSubmitInfo` to submit the given list of |command_buffers| that
+// wait on |wait_semaphores| and signal |signal_semaphores|. Necessary
+// structures are allocated from |arena| and the result `VkSubmitInfo` is
+// written to |submit_info|.
+void PrepareSubmitInfo(
+ const absl::InlinedVector<VkSemaphore, 4>& wait_semaphores,
+ absl::Span<CommandBuffer* const> command_buffers,
+ const absl::InlinedVector<VkSemaphore, 4>& signal_semaphores,
+ VkSubmitInfo* submit_info, Arena* arena) {
+ IREE_TRACE_SCOPE0("PrepareSubmitInfo");
+
+ // TODO(benvanik): see if we can go to finer-grained stages.
+ // For example, if this was just queue ownership transfers then we can use
+ // the pseudo-stage of VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT.
+ VkPipelineStageFlags dst_stage_mask =
+ VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+
+ auto wait_semaphore_handles =
+ arena->AllocateSpan<VkSemaphore>(wait_semaphores.size());
+ auto wait_dst_stage_masks =
+ arena->AllocateSpan<VkPipelineStageFlags>(wait_semaphores.size());
+ for (int i = 0, e = wait_semaphores.size(); i < e; ++i) {
+ wait_semaphore_handles[i] = wait_semaphores[i];
+ wait_dst_stage_masks[i] = dst_stage_mask;
+ }
+
+ auto signal_semaphore_handles =
+ arena->AllocateSpan<VkSemaphore>(signal_semaphores.size());
+ for (int i = 0, e = signal_semaphores.size(); i < e; ++i) {
+ signal_semaphore_handles[i] = signal_semaphores[i];
+ }
+
+ auto command_buffer_handles =
+ arena->AllocateSpan<VkCommandBuffer>(command_buffers.size());
+ for (int i = 0, e = command_buffers.size(); i < e; ++i) {
+ const auto& command_buffer = command_buffers[i];
+ auto* direct_command_buffer =
+ static_cast<DirectCommandBuffer*>(command_buffer->impl());
+ command_buffer_handles[i] = direct_command_buffer->handle();
+ }
+
+ submit_info->sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+ submit_info->pNext = nullptr;
+ submit_info->waitSemaphoreCount = wait_semaphore_handles.size();
+ submit_info->pWaitSemaphores = wait_semaphore_handles.data();
+ submit_info->pWaitDstStageMask = wait_dst_stage_masks.data();
+ submit_info->commandBufferCount = command_buffer_handles.size();
+ submit_info->pCommandBuffers = command_buffer_handles.data();
+ submit_info->signalSemaphoreCount = signal_semaphore_handles.size();
+ submit_info->pSignalSemaphores = signal_semaphore_handles.data();
+}
+
+} // namespace
+
+SerializingCommandQueue::SerializingCommandQueue(
+ std::string name, CommandCategoryBitfield supported_categories,
+ const ref_ptr<VkDeviceHandle>& logical_device,
+ const ref_ptr<TimePointFencePool>& fence_pool, VkQueue queue)
+ : CommandQueue(std::move(name), supported_categories),
+ logical_device_(add_ref(logical_device)),
+ fence_pool_(add_ref(fence_pool)),
+ queue_(queue) {}
+
+SerializingCommandQueue::~SerializingCommandQueue() {
+ IREE_TRACE_SCOPE0("SerializingCommandQueue::dtor");
+ absl::MutexLock lock(&mutex_);
+ syms()->vkQueueWaitIdle(queue_);
+}
+
+Status SerializingCommandQueue::Submit(
+ absl::Span<const SubmissionBatch> batches) {
+ IREE_TRACE_SCOPE0("SerializingCommandQueue::Submit");
+
+ absl::MutexLock lock(&mutex_);
+ for (const auto& batch : batches) {
+ // Grab a fence for this submission first. This will be used to check the
+ // progress of emulated timeline semaphores later.
+ ASSIGN_OR_RETURN(auto fence, fence_pool_->Acquire());
+ deferred_submissions_.push_back(
+ std::make_unique<FencedSubmission>(batch, std::move(fence)));
+ }
+
+ return ProcessDeferredSubmissions().status();
+}
+
+StatusOr<bool> SerializingCommandQueue::ProcessDeferredSubmissions() {
+ IREE_TRACE_SCOPE0("SerializingCommandQueue::ProcessDeferredSubmissions");
+
+ // Prepare `VkSubmitInfo`s for all submissions we are able to submit.
+
+ // Note that we must keep all arrays referenced alive until submission
+ // completes and since there are a bunch of them we use an arena.
+ Arena arena(4 * 1024);
+
+ absl::InlinedVector<VkSubmitInfo, 4> submit_infos;
+ absl::InlinedVector<VkFence, 4> submit_fences;
+
+ absl::InlinedVector<VkSemaphore, 4> wait_semaphores;
+ absl::InlinedVector<VkSemaphore, 4> signal_semaphores;
+
+ // A list of submissions that still needs to be deferred.
+ IntrusiveList<std::unique_ptr<FencedSubmission>> remaining_submissions;
+
+ while (!deferred_submissions_.empty()) {
+ wait_semaphores.clear();
+ signal_semaphores.clear();
+
+ auto submission = deferred_submissions_.take(deferred_submissions_.front());
+ const SubmissionBatch& batch = submission->batch;
+ ref_ptr<TimePointFence> fence(std::move(submission->fence));
+
+ ASSIGN_OR_RETURN(bool ready_to_submit,
+ TryToPrepareSemaphores(batch, fence, &wait_semaphores,
+ &signal_semaphores));
+
+ if (ready_to_submit) {
+ submit_infos.emplace_back();
+ PrepareSubmitInfo(wait_semaphores, batch.command_buffers,
+ signal_semaphores, &submit_infos.back(), &arena);
+ submit_fences.push_back(fence->value());
+ pending_fences_.emplace_back(std::move(fence));
+ } else {
+ // We need to defer the submission until later.
+ remaining_submissions.push_back(std::move(submission));
+ }
+ }
+
+ if (submit_infos.empty()) return false;
+
+ auto infos = arena.AllocateSpan<VkSubmitInfo>(submit_infos.size());
+ for (int i = 0, e = submit_infos.size(); i < e; ++i) {
+ infos[i] = submit_infos[i];
+ }
+
+ // Note: We might be able to batch the submission but it involves non-trivial
+ // fence handling; we can handle that if really needed. NOTE(review): the arena-copied `infos` span above is never passed to vkQueueSubmit -- confirm it is needed.
+ for (int i = 0, e = submit_infos.size(); i < e; ++i) {
+ VK_RETURN_IF_ERROR(syms()->vkQueueSubmit(
+ queue_, /*submitCount=*/1, &submit_infos[i], submit_fences[i]));
+ }
+
+ while (!remaining_submissions.empty()) {
+ deferred_submissions_.push_back(
+ remaining_submissions.take(remaining_submissions.front()));
+ }
+
+ return true;
+}
+
+Status SerializingCommandQueue::WaitIdle(absl::Time deadline) {
+ absl::MutexLock lock(&mutex_);
+
+ if (deadline == absl::InfiniteFuture()) {
+ IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#vkQueueWaitIdle");
+ // Fast path for using vkQueueWaitIdle, which is usually cheaper (as it
+ // requires fewer calls into the driver).
+
+ // Complete all pending work on the queue.
+ VK_RETURN_IF_ERROR(syms()->vkQueueWaitIdle(queue_));
+ pending_fences_.clear();
+
+ // Submit and complete all deferred work.
+ while (!deferred_submissions_.empty()) {
+ ASSIGN_OR_RETURN(bool work_submitted, ProcessDeferredSubmissions());
+ if (work_submitted) {
+ VK_RETURN_IF_ERROR(syms()->vkQueueWaitIdle(queue_));
+ pending_fences_.clear();
+ }
+ }
+
+ return OkStatus();
+ }
+
+ IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#Fence");
+
+ // Keep trying to submit more workload to the GPU until reaching the deadline.
+ do {
+ RETURN_IF_ERROR(ProcessDeferredSubmissions().status());
+
+ uint64_t timeout_nanos;
+ if (deadline == absl::InfinitePast()) {
+ // Do not wait.
+ timeout_nanos = 0;
+ } else {
+ // Convert to relative time in nanoseconds.
+ // The implementation may not wait with this granularity (like, by
+ // 10000x).
+ absl::Time now = absl::Now();
+ if (deadline < now) {
+ return DeadlineExceededErrorBuilder(IREE_LOC)
+ << "Deadline exceeded waiting for idle";
+ }
+ timeout_nanos =
+ static_cast<uint64_t>(absl::ToInt64Nanoseconds(deadline - now));
+ }
+
+ if (pending_fences_.empty()) continue;
+
+ std::vector<VkFence> fences;
+ fences.reserve(pending_fences_.size());
+ for (const auto& fence : pending_fences_) fences.push_back(fence->value());
+
+ VkResult result =
+ syms()->vkWaitForFences(*logical_device_, fences.size(), fences.data(),
+ /*waitAll=*/VK_TRUE, timeout_nanos);
+
+ switch (result) {
+ case VK_SUCCESS:
+ pending_fences_.clear();
+ break;
+ case VK_TIMEOUT:
+ return DeadlineExceededErrorBuilder(IREE_LOC)
+ << "Deadline exceeded waiting for idle";
+ default:
+ return VkResultToStatus(result);
+ }
+ // As long as there is submitted or deferred work still pending.
+ } while (!pending_fences_.empty() || !deferred_submissions_.empty());
+
+ return OkStatus();
+}
+
+Status SerializingCommandQueue::AdvanceQueueSubmission() {
+ absl::MutexLock lock(&mutex_);
+ // The returned value just indicates whether any newly ready
+ // submissions were submitted to the GPU. Other callers might be
+ // interested in that information but for this API we just want to advance
+ // queue submission if possible. So we ignore it here.
+ ASSIGN_OR_RETURN(std::ignore, ProcessDeferredSubmissions());
+ return OkStatus();
+}
+
+void SerializingCommandQueue::AbortQueueSubmission() {
+ absl::MutexLock lock(&mutex_);
+
+ // We have fences in deferred_submissions_ but they are not submitted to GPU
+ // yet so we don't need to reset.
+ deferred_submissions_.clear();
+
+ std::vector<VkFence> fences;
+ fences.reserve(pending_fences_.size());
+ for (const auto& fence : pending_fences_) fences.push_back(fence->value());
+
+ syms()->vkWaitForFences(*logical_device_, fences.size(), fences.data(),
+ /*waitAll=*/VK_TRUE, /*timeout=*/UINT64_MAX);
+ // Clear the list. Fences will be automatically returned back to the queue
+ // after refcount reaches 0.
+ pending_fences_.clear();
+}
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
diff --git a/iree/hal/vulkan/serializing_command_queue.h b/iree/hal/vulkan/serializing_command_queue.h
new file mode 100644
index 0000000..e38643b
--- /dev/null
+++ b/iree/hal/vulkan/serializing_command_queue.h
@@ -0,0 +1,111 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+#define IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+
+#include <vulkan/vulkan.h>
+
+#include <memory>
+#include <string>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/command_queue.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A command queue that potentially defers and serializes command buffer
+// submission to the GPU.
+//
+// This command queue is designed to be used together with emulated timeline
+// semaphores. Timeline semaphores can follow wait-before-signal submission
+// order but binary `VkSemaphore` cannot. So when emulating timeline semaphores
+// with binary `VkSemaphore`s and `VkFence`s, we need to make sure no
+// wait-before-signal submission order occur for binary `VkSemaphore`s. The way
+// to enforce that is to defer the submission until we can be certain that the
+// `VkSemaphore`s emulating time points in the timeline are all *submitted* to
+// the GPU.
+class SerializingCommandQueue final : public CommandQueue {
+ public:
+ SerializingCommandQueue(std::string name,
+ CommandCategoryBitfield supported_categories,
+ const ref_ptr<VkDeviceHandle>& logical_device,
+ const ref_ptr<TimePointFencePool>& fence_pool,
+ VkQueue queue);
+ ~SerializingCommandQueue() override;
+
+ const ref_ptr<DynamicSymbols>& syms() const {
+ return logical_device_->syms();
+ }
+
+ Status Submit(absl::Span<const SubmissionBatch> batches) override;
+
+ Status WaitIdle(absl::Time deadline) override;
+
+ // Releases all deferred submissions ready to submit to the GPU.
+ Status AdvanceQueueSubmission();
+
+ // Aborts all deferred submissions and waits for submitted work to complete.
+ void AbortQueueSubmission();
+
+ private:
+ // A submission batch together with the fence to signal its status.
+ struct FencedSubmission : IntrusiveLinkBase<void> {
+ SubmissionBatch batch;
+ ref_ptr<TimePointFence> fence;
+
+ FencedSubmission(const SubmissionBatch& batch,
+ ref_ptr<TimePointFence> fence)
+ : batch(batch), fence(std::move(fence)) {}
+ };
+
+ // Processes deferred submissions in this queue and returns whether any
+ // new workload was submitted to the GPU if no errors happen.
+ StatusOr<bool> ProcessDeferredSubmissions()
+ ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ ref_ptr<VkDeviceHandle> logical_device_;
+
+ ref_ptr<TimePointFencePool> fence_pool_;
+
+ mutable absl::Mutex mutex_;
+
+ // A list of fences that are submitted to GPU.
+ absl::InlinedVector<ref_ptr<TimePointFence>, 4> pending_fences_
+ ABSL_GUARDED_BY(mutex_);
+ // A list of deferred submissions that haven't been submitted to GPU.
+ IntrusiveList<std::unique_ptr<FencedSubmission>> deferred_submissions_
+ ABSL_GUARDED_BY(mutex_);
+
+ // VkQueue needs to be externally synchronized.
+ VkQueue queue_ ABSL_GUARDED_BY(mutex_);
+};
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
+
+#endif // IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
diff --git a/iree/hal/vulkan/timepoint_util.cc b/iree/hal/vulkan/timepoint_util.cc
new file mode 100644
index 0000000..c212856
--- /dev/null
+++ b/iree/hal/vulkan/timepoint_util.cc
@@ -0,0 +1,226 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/timepoint_util.h"
+
+#include <memory>
+
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "absl/utility/utility.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// static
+void TimePointFence::Delete(TimePointFence* ptr) {
+ ptr->pool()->ReleaseResolved(ptr);
+}
+
+VkResult TimePointFence::GetStatus() {
+ absl::MutexLock lock(&status_mutex_);
+ if (status_ == VK_NOT_READY) {
+ const auto& device = pool()->logical_device();
+ status_ = device->syms()->vkGetFenceStatus(*device, fence_);
+ }
+ return status_;
+}
+
+// static
+StatusOr<ref_ptr<TimePointFencePool>> TimePointFencePool::Create(
+ ref_ptr<VkDeviceHandle> logical_device) {
+ IREE_TRACE_SCOPE0("TimePointFencePool::Create");
+ ref_ptr<TimePointFencePool> pool(
+ new TimePointFencePool(std::move(logical_device)));
+ RETURN_IF_ERROR(pool->PreallocateFences());
+ return pool;
+}
+
+TimePointFencePool::~TimePointFencePool() {
+ IREE_TRACE_SCOPE0("TimePointFencePool::dtor");
+
+ absl::MutexLock lock(&mutex_);
+ int free_count = 0;
+ for (auto* fence : free_fences_) {
+ syms()->vkDestroyFence(*logical_device_, fence->value(),
+ logical_device_->allocator());
+ ++free_count;
+ }
+ DCHECK_EQ(free_count, kMaxInFlightFenceCount);
+ free_fences_.clear();
+}
+
+StatusOr<ref_ptr<TimePointFence>> TimePointFencePool::Acquire() {
+ IREE_TRACE_SCOPE0("TimePointFencePool::Acquire");
+
+ absl::MutexLock lock(&mutex_);
+ if (free_fences_.empty()) {
+ return ResourceExhaustedErrorBuilder(IREE_LOC)
+ << "Fence pool out of free fences";
+ }
+
+ auto* fence = free_fences_.front();
+ free_fences_.pop_front();
+ return add_ref(fence);
+}
+
+void TimePointFencePool::ReleaseResolved(TimePointFence* fence) {
+ IREE_TRACE_SCOPE0("TimePointFencePool::ReleaseResolved");
+ VkFence f = fence->value();
+ syms()->vkResetFences(*logical_device_, 1, &f);
+ absl::MutexLock lock(&mutex_);
+ free_fences_.push_back(fence);
+}
+
+TimePointFencePool::TimePointFencePool(ref_ptr<VkDeviceHandle> logical_device)
+ : logical_device_(std::move(logical_device)) {}
+
+const ref_ptr<DynamicSymbols>& TimePointFencePool::syms() const {
+ return logical_device_->syms();
+}
+
+Status TimePointFencePool::PreallocateFences() {
+ IREE_TRACE_SCOPE0("TimePointFencePool::PreallocateFences");
+
+ VkFenceCreateInfo create_info;
+ create_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+ create_info.pNext = nullptr;
+ create_info.flags = 0;
+
+ std::array<std::unique_ptr<TimePointFence>, kMaxInFlightFenceCount> fences;
+ {
+ absl::MutexLock lock(&mutex_);
+ for (int i = 0; i < fences.size(); ++i) {
+ VkFence fence = VK_NULL_HANDLE;
+ VK_RETURN_IF_ERROR(syms()->vkCreateFence(*logical_device_, &create_info,
+ logical_device_->allocator(),
+ &fence));
+ fences[i].reset(new TimePointFence(this, fence));
+ }
+ }
+
+ for (int i = 0; i < fences.size(); ++i) {
+ // The `TimePointFence`s were created with an initial ref-count of one.
+ // Decrease explicitly to zero so that later we can rely on the ref-count
+ // reaching zero to auto-release the `TimePointFence` back to the free
+ // list. As a nice side effect, this will also initialize the free list
+ // with all newly created fences.
+ // TODO: Might want to avoid acquiring and releasing the mutex for each
+ // fence.
+ fences[i].release()->ReleaseReference();
+ }
+
+ return OkStatus();
+}
+
+// static
+StatusOr<ref_ptr<TimePointSemaphorePool>> TimePointSemaphorePool::Create(
+ ref_ptr<VkDeviceHandle> logical_device) {
+ IREE_TRACE_SCOPE0("TimePointSemaphorePool::Create");
+ ref_ptr<TimePointSemaphorePool> pool(
+ new TimePointSemaphorePool(std::move(logical_device)));
+ RETURN_IF_ERROR(pool->PreallocateSemaphores());
+ return pool;
+}
+
+TimePointSemaphorePool::~TimePointSemaphorePool() {
+ IREE_TRACE_SCOPE0("TimePointSemaphorePool::dtor");
+
+ absl::MutexLock lock(&mutex_);
+
+ DCHECK_EQ(free_semaphores_.size(), kMaxInFlightSemaphoreCount);
+ free_semaphores_.clear();
+
+ for (auto& semaphore : storage_) {
+ syms()->vkDestroySemaphore(*logical_device_, semaphore.semaphore,
+ logical_device_->allocator());
+ }
+}
+
+StatusOr<TimePointSemaphore*> TimePointSemaphorePool::Acquire() {
+ IREE_TRACE_SCOPE0("TimePointSemaphorePool::Acquire");
+
+ absl::MutexLock lock(&mutex_);
+ if (free_semaphores_.empty()) {
+ return ResourceExhaustedErrorBuilder(IREE_LOC)
+ << "Semaphore pool out of free semaphores";
+ }
+
+ auto* semaphore = free_semaphores_.front();
+ free_semaphores_.pop_front();
+ return semaphore;
+}
+
+void TimePointSemaphorePool::ReleaseResolved(
+ IntrusiveList<TimePointSemaphore>* semaphores) {
+ IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseResolved");
+
+ for (auto* semaphore : *semaphores) {
+ DCHECK(!semaphore->signal_fence && !semaphore->wait_fence);
+ semaphore->value = UINT64_MAX;
+ }
+
+ absl::MutexLock lock(&mutex_);
+ free_semaphores_.merge_from(semaphores);
+}
+
+void TimePointSemaphorePool::ReleaseUnresolved(
+ IntrusiveList<TimePointSemaphore>* semaphores) {
+ IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseUnresolved");
+
+ for (auto* semaphore : *semaphores) {
+ semaphore->signal_fence = nullptr;
+ semaphore->wait_fence = nullptr;
+ semaphore->value = UINT64_MAX;
+ }
+
+ absl::MutexLock lock(&mutex_);
+ free_semaphores_.merge_from(semaphores);
+}
+
+TimePointSemaphorePool::TimePointSemaphorePool(
+ ref_ptr<VkDeviceHandle> logical_device)
+ : logical_device_(std::move(logical_device)) {}
+
+const ref_ptr<DynamicSymbols>& TimePointSemaphorePool::syms() const {
+ return logical_device_->syms();
+}
+
+Status TimePointSemaphorePool::PreallocateSemaphores() {
+ IREE_TRACE_SCOPE0("TimePointSemaphorePool::PreallocateSemaphores");
+
+ VkSemaphoreCreateInfo create_info;
+ create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+ create_info.pNext = nullptr;
+ create_info.flags = 0;
+
+ absl::MutexLock lock(&mutex_);
+ for (int i = 0; i < kMaxInFlightSemaphoreCount; ++i) {
+ auto* semaphore = &storage_[i];
+ VK_RETURN_IF_ERROR(syms()->vkCreateSemaphore(*logical_device_, &create_info,
+ logical_device_->allocator(),
+ &semaphore->semaphore));
+ free_semaphores_.push_back(semaphore);
+ }
+
+ return OkStatus();
+}
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
diff --git a/iree/hal/vulkan/timepoint_util.h b/iree/hal/vulkan/timepoint_util.h
new file mode 100644
index 0000000..e2cb7df
--- /dev/null
+++ b/iree/hal/vulkan/timepoint_util.h
@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+#define IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+
+#include <vulkan/vulkan.h>
+
+#include <atomic>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+class TimePointFencePool;
+class TimePointSemaphorePool;
+
+// A fence used for tracking progress of timeline semaphores.
+//
+// Each queue submission gets a new `VkFence` associated with it so that we can
+// later query the `VkFence` on CPU to know what time points were signaled for
+// timeline semaphores.
+//
+// Ref-counting allows the fence to be associated with multiple time points from
+// different timelines without worrying about ownership complexity.
+//
+// This is expected to be used together with `TimePointFencePool` and must be
+// externally synchronized via `TimePointFencePool`'s mutex.
+class TimePointFence final : public RefObject<TimePointFence>,
+ public IntrusiveLinkBase<void> {
+ public:
+ TimePointFence(TimePointFencePool* pool, VkFence fence)
+ : pool_(pool), fence_(fence) {}
+
+ TimePointFence(TimePointFence&& that) = delete;
+ TimePointFence& operator=(TimePointFence&&) = delete;
+
+ TimePointFence(const TimePointFence&) = delete;
+ TimePointFence& operator=(const TimePointFence&) = delete;
+
+ // Returns this fence to the pool on destruction.
+ static void Delete(TimePointFence* ptr);
+
+ VkFence value() const noexcept { return fence_; }
+ operator VkFence() const noexcept { return fence_; }
+
+ // Gets the status of this fence object. This might issue a Vulkan API call
+ // under the hood.
+ VkResult GetStatus();
+
+ // Returns the pool from which this fence comes.
+ TimePointFencePool* pool() const { return pool_; }
+
+ private:
+ // The pool from which this fence comes.
+ TimePointFencePool* pool_;
+
+ // Allocated fence that is associated with a bunch of time point(s) of
+ // timeline(s). This is passed to queue submission so that we can track the
+ // timeline(s) progress on CPU and schedule work.
+ VkFence fence_;
+
+ // The fence's status.
+ absl::Mutex status_mutex_;
+ VkResult status_ ABSL_GUARDED_BY(status_mutex_) = VK_NOT_READY;
+};
+
+// A semaphore used for emulating a specific time point of timeline semaphores.
+//
+// Each signaled time point in a timeline semaphore is emulated with a new
+// binary `VkSemaphore` associated with queue submission. These time point
+// semaphores are stored in `EmulatedTimelineSemaphore` to quickly scan and
+// process signaled values.
+//
+// This is expected to be used together with `TimePointSemaphorePool` and
+// `EmulatedTimelineSemaphore` and must be externally synchronized via their
+// mutexes.
+struct TimePointSemaphore final : public IntrusiveLinkBase<void> {
+ // Allocated binary semaphore that represents a time point in the timeline.
+ // This is passed to queue submission.
+ VkSemaphore semaphore = VK_NULL_HANDLE;
+
+ // Value the timeline should be at when the binary semaphore is signaled.
+ uint64_t value = UINT64_MAX;
+
+ // The fence associated with the queue submission signaling this semaphore.
+ // nullptr means this binary semaphore has not been submitted to GPU.
+ ref_ptr<TimePointFence> signal_fence = nullptr;
+
+ // The fence associated with the queue submission waiting on this semaphore.
+ // nullptr means this binary semaphore has not been waited on by any queue
+ // submission.
+ ref_ptr<TimePointFence> wait_fence = nullptr;
+};
+
+// A pool of `VkFence`s that can be used by `EmulatedTimelineSemaphore` to track
+// timeline progress on CPU. Each `VkFence` can be used to query the status of
+// all the semaphores in the same submission to a `VkQueue`.
+class TimePointFencePool final : public RefObject<TimePointFencePool> {
+ public:
+ static constexpr int kMaxInFlightFenceCount = 32;
+
+ // Creates a new pool and pre-allocates `kMaxInFlightFenceCount` fences.
+ static StatusOr<ref_ptr<TimePointFencePool>> Create(
+ ref_ptr<VkDeviceHandle> logical_device);
+
+ ~TimePointFencePool();
+
+ // Acquires a fence from the pool for use by the caller. The fence is
+ // guaranteed to be in unsignaled state and not in-flight on GPU.
+ //
+ // Returns RESOURCE_EXHAUSTED if the pool has no more available fences.
+ // Callers are expected to handle this by waiting on previous fences or for
+ // complete device idle. Yes, that's as bad as it sounds, and if we start
+ // seeing that we should bump up the max count.
+ StatusOr<ref_ptr<TimePointFence>> Acquire();
+
+ // Releases one fence back to the pool. The fence must either be signaled or
+ // not be in flight on GPU.
+ void ReleaseResolved(TimePointFence* fence);
+
+ const ref_ptr<VkDeviceHandle>& logical_device() const {
+ return logical_device_;
+ }
+
+ private:
+ explicit TimePointFencePool(ref_ptr<VkDeviceHandle> logical_device);
+
+ const ref_ptr<DynamicSymbols>& syms() const;
+
+ Status PreallocateFences() ABSL_LOCKS_EXCLUDED(mutex_);
+
+ ref_ptr<VkDeviceHandle> logical_device_;
+
+ absl::Mutex mutex_;
+
+ IntrusiveList<TimePointFence> free_fences_ ABSL_GUARDED_BY(mutex_);
+};
+
+// A pool of `VkSemaphore`s that can be used by `EmulatedTimelineSemaphore` to
+// simulate individual timeline value signaling.
+class TimePointSemaphorePool final : public RefObject<TimePointSemaphorePool> {
+ public:
+ static constexpr int kMaxInFlightSemaphoreCount = 64;
+
+ // Creates a new pool and pre-allocates `kMaxInFlightSemaphoreCount` binary
+ // semaphores.
+ static StatusOr<ref_ptr<TimePointSemaphorePool>> Create(
+ ref_ptr<VkDeviceHandle> logical_device);
+
+ ~TimePointSemaphorePool();
+
+ // Acquires a binary semaphore from the pool for use by the caller. The
+ // semaphore is guaranteed to be in unsignaled state and not in-flight on GPU.
+ //
+ // Returns RESOURCE_EXHAUSTED if the pool has no more available semaphores.
+ // Callers are expected to handle this by waiting on previous fences or for
+ // complete device idle. Yes, that's as bad as it sounds, and if we start
+ // seeing that we should bump up the max count.
+ StatusOr<TimePointSemaphore*> Acquire();
+
+ // Releases one or more semaphores back to the pool. The binary semaphore must
+ // be unsignaled and not in flight on GPU.
+ void ReleaseResolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+ // Releases one or more semaphores back to the pool. These may be in any state
+ // and will be assumed as untouchable; the pool will unconditionally recycle
+ // them.
+ void ReleaseUnresolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+ private:
+ explicit TimePointSemaphorePool(ref_ptr<VkDeviceHandle> logical_device);
+
+ const ref_ptr<DynamicSymbols>& syms() const;
+
+ Status PreallocateSemaphores() ABSL_LOCKS_EXCLUDED(mutex_);
+
+ ref_ptr<VkDeviceHandle> logical_device_;
+
+ absl::Mutex mutex_;
+
+ std::array<TimePointSemaphore, kMaxInFlightSemaphoreCount> storage_
+ ABSL_GUARDED_BY(mutex_);
+ IntrusiveList<TimePointSemaphore> free_semaphores_ ABSL_GUARDED_BY(mutex_);
+};
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
+
+#endif // IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc
index 3c7e37b..42b56c4 100644
--- a/iree/hal/vulkan/vulkan_device.cc
+++ b/iree/hal/vulkan/vulkan_device.cc
@@ -30,12 +30,14 @@
#include "iree/hal/vulkan/direct_command_buffer.h"
#include "iree/hal/vulkan/direct_command_queue.h"
#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
#include "iree/hal/vulkan/extensibility_util.h"
#include "iree/hal/vulkan/native_descriptor_set.h"
#include "iree/hal/vulkan/native_event.h"
#include "iree/hal/vulkan/native_timeline_semaphore.h"
#include "iree/hal/vulkan/pipeline_cache.h"
#include "iree/hal/vulkan/pipeline_executable_layout.h"
+#include "iree/hal/vulkan/serializing_command_queue.h"
#include "iree/hal/vulkan/status_util.h"
#include "iree/hal/vulkan/vma_allocator.h"
@@ -164,6 +166,7 @@
const DeviceInfo& device_info,
const ref_ptr<VkDeviceHandle>& logical_device,
const QueueSet& compute_queue_set, const QueueSet& transfer_queue_set,
+ const ref_ptr<TimePointFencePool>& fence_pool,
const ref_ptr<DynamicSymbols>& syms) {
absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues;
@@ -175,10 +178,17 @@
syms->vkGetDeviceQueue(*logical_device,
compute_queue_set.queue_family_index, i, &queue);
std::string queue_name = absl::StrCat(device_info.name(), ":d", i);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+ command_queues.push_back(absl::make_unique<SerializingCommandQueue>(
+ std::move(queue_name),
+ CommandCategory::kDispatch | CommandCategory::kTransfer, logical_device,
+ fence_pool, queue));
+#else
command_queues.push_back(absl::make_unique<DirectCommandQueue>(
std::move(queue_name),
CommandCategory::kDispatch | CommandCategory::kTransfer, logical_device,
queue));
+#endif // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
}
uint64_t transfer_queue_count = CountOnes64(transfer_queue_set.queue_indices);
@@ -189,9 +199,15 @@
syms->vkGetDeviceQueue(*logical_device,
transfer_queue_set.queue_family_index, i, &queue);
std::string queue_name = absl::StrCat(device_info.name(), ":t", i);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+ command_queues.push_back(absl::make_unique<SerializingCommandQueue>(
+ std::move(queue_name), CommandCategory::kTransfer, logical_device,
+ fence_pool, queue));
+#else
command_queues.push_back(absl::make_unique<DirectCommandQueue>(
std::move(queue_name), CommandCategory::kTransfer, logical_device,
queue));
+#endif // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
}
return command_queues;
@@ -354,14 +370,27 @@
for (uint32_t i = 0; i < queue_family_info.transfer_queue_count; ++i) {
transfer_queue_set.queue_indices |= 1 << (i + base_queue_index);
}
- auto command_queues = CreateCommandQueues(
- device_info, logical_device, compute_queue_set, transfer_queue_set, syms);
+
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+ ASSIGN_OR_RETURN(auto semaphore_pool,
+ TimePointSemaphorePool::Create(add_ref(logical_device)));
+ ASSIGN_OR_RETURN(auto fence_pool,
+ TimePointFencePool::Create(add_ref(logical_device)));
+#else
+ ref_ptr<TimePointSemaphorePool> semaphore_pool = nullptr;
+ ref_ptr<TimePointFencePool> fence_pool = nullptr;
+#endif // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+ auto command_queues =
+ CreateCommandQueues(device_info, logical_device, compute_queue_set,
+ transfer_queue_set, fence_pool, syms);
return assign_ref(new VulkanDevice(
std::move(driver), device_info, physical_device,
std::move(logical_device), std::move(allocator),
std::move(command_queues), std::move(dispatch_command_pool),
- std::move(transfer_command_pool), debug_capture_manager));
+ std::move(transfer_command_pool), std::move(semaphore_pool),
+ std::move(fence_pool), debug_capture_manager));
}
// static
@@ -421,13 +450,25 @@
device_handle, transfer_queue_set.queue_family_index));
}
- auto command_queues = CreateCommandQueues(
- device_info, device_handle, compute_queue_set, transfer_queue_set, syms);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+ ASSIGN_OR_RETURN(auto semaphore_pool,
+ TimePointSemaphorePool::Create(add_ref(device_handle)));
+ ASSIGN_OR_RETURN(auto fence_pool,
+ TimePointFencePool::Create(add_ref(device_handle)));
+#else
+ ref_ptr<TimePointSemaphorePool> semaphore_pool = nullptr;
+ ref_ptr<TimePointFencePool> fence_pool = nullptr;
+#endif // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+ auto command_queues =
+ CreateCommandQueues(device_info, device_handle, compute_queue_set,
+ transfer_queue_set, fence_pool, syms);
return assign_ref(new VulkanDevice(
std::move(driver), device_info, physical_device, std::move(device_handle),
std::move(allocator), std::move(command_queues),
std::move(dispatch_command_pool), std::move(transfer_command_pool),
+ std::move(semaphore_pool), std::move(fence_pool),
/*debug_capture_manager=*/nullptr));
}
@@ -438,6 +479,8 @@
absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues,
ref_ptr<VkCommandPoolHandle> dispatch_command_pool,
ref_ptr<VkCommandPoolHandle> transfer_command_pool,
+ ref_ptr<TimePointSemaphorePool> semaphore_pool,
+ ref_ptr<TimePointFencePool> fence_pool,
DebugCaptureManager* debug_capture_manager)
: Device(device_info),
driver_(std::move(driver)),
@@ -449,6 +492,8 @@
make_ref<DescriptorPoolCache>(add_ref(logical_device_))),
dispatch_command_pool_(std::move(dispatch_command_pool)),
transfer_command_pool_(std::move(transfer_command_pool)),
+ semaphore_pool_(std::move(semaphore_pool)),
+ fence_pool_(std::move(fence_pool)),
debug_capture_manager_(debug_capture_manager) {
// Populate the queue lists based on queue capabilities.
for (auto& command_queue : command_queues_) {
@@ -650,8 +695,33 @@
StatusOr<ref_ptr<Semaphore>> VulkanDevice::CreateSemaphore(
uint64_t initial_value) {
IREE_TRACE_SCOPE0("VulkanDevice::CreateSemaphore");
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+ return EmulatedTimelineSemaphore::Create(
+ add_ref(logical_device_),
+      // Triggers necessary processing on all queues due to new values getting
+      // signaled for the given timeline |semaphore|.
+ [this](Semaphore* /*semaphore*/) -> Status {
+ IREE_TRACE_SCOPE0("<lambda>::OnSemaphoreSignal");
+ for (const auto& queue : command_queues_) {
+ RETURN_IF_ERROR(static_cast<SerializingCommandQueue*>(queue.get())
+ ->AdvanceQueueSubmission());
+ }
+ return OkStatus();
+ },
+ // Triggers necessary processing on all queues due to failures for the
+ // given timeline |semaphore|.
+ [this](Semaphore* /*semaphore*/) {
+ IREE_TRACE_SCOPE0("<lambda>::OnSemaphoreFailure");
+ for (const auto& queue : command_queues_) {
+ static_cast<SerializingCommandQueue*>(queue.get())
+ ->AbortQueueSubmission();
+ }
+ },
+ add_ref(semaphore_pool_), initial_value);
+#else
return NativeTimelineSemaphore::Create(add_ref(logical_device_),
initial_value);
+#endif // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
}
Status VulkanDevice::WaitAllSemaphores(
@@ -672,6 +742,23 @@
VkSemaphoreWaitFlags wait_flags) {
IREE_TRACE_SCOPE0("VulkanDevice::WaitSemaphores");
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+  // TODO(antiagainst): We actually should get the fences associated with the
+  // emulated timeline semaphores so that we can wait on them in a batch. This
+  // implementation is problematic if we want to wait on any semaphore and the
+  // first one takes extra long time but the following ones signal quickly.
+ for (int i = 0; i < semaphores.size(); ++i) {
+ auto* semaphore =
+ static_cast<EmulatedTimelineSemaphore*>(semaphores[i].semaphore);
+ RETURN_IF_ERROR(semaphore->Wait(semaphores[i].value, deadline));
+ if (wait_flags & VK_SEMAPHORE_WAIT_ANY_BIT) return OkStatus();
+ }
+
+ return OkStatus();
+
+#else
+
absl::InlinedVector<VkSemaphore, 4> semaphore_handles(semaphores.size());
absl::InlinedVector<uint64_t, 4> semaphore_values(semaphores.size());
for (int i = 0; i < semaphores.size(); ++i) {
@@ -714,6 +801,8 @@
// semaphores we waited on (including those already expired above).
return OkStatus();
+
+#endif // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
}
Status VulkanDevice::WaitIdle(absl::Time deadline) {
diff --git a/iree/hal/vulkan/vulkan_device.h b/iree/hal/vulkan/vulkan_device.h
index cfceb4f..ce7c9d7 100644
--- a/iree/hal/vulkan/vulkan_device.h
+++ b/iree/hal/vulkan/vulkan_device.h
@@ -30,6 +30,7 @@
#include "iree/hal/semaphore.h"
#include "iree/hal/vulkan/descriptor_pool_cache.h"
#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
#include "iree/hal/vulkan/extensibility_util.h"
#include "iree/hal/vulkan/handle_util.h"
@@ -119,6 +120,8 @@
absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues,
ref_ptr<VkCommandPoolHandle> dispatch_command_pool,
ref_ptr<VkCommandPoolHandle> transfer_command_pool,
+ ref_ptr<TimePointSemaphorePool> semaphore_pool,
+ ref_ptr<TimePointFencePool> fence_pool,
DebugCaptureManager* debug_capture_manager);
Status WaitSemaphores(absl::Span<const SemaphoreValue> semaphores,
@@ -139,6 +142,10 @@
ref_ptr<VkCommandPoolHandle> dispatch_command_pool_;
ref_ptr<VkCommandPoolHandle> transfer_command_pool_;
+ // Fields used for emulated timeline semaphores.
+ ref_ptr<TimePointSemaphorePool> semaphore_pool_;
+ ref_ptr<TimePointFencePool> fence_pool_;
+
DebugCaptureManager* debug_capture_manager_ = nullptr;
};
diff --git a/iree/hal/vulkan/vulkan_driver_module.cc b/iree/hal/vulkan/vulkan_driver_module.cc
index 4b98f58..f034127 100644
--- a/iree/hal/vulkan/vulkan_driver_module.cc
+++ b/iree/hal/vulkan/vulkan_driver_module.cc
@@ -67,9 +67,11 @@
// promoted to core, so we list it as optional even though we require it.
options.instance_extensibility.optional_extensions.push_back(
VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES == 0
// Timeline semaphore support is required.
options.device_extensibility.required_extensions.push_back(
VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
+#endif
if (absl::GetFlag(FLAGS_vulkan_validation_layers)) {
options.instance_extensibility.optional_layers.push_back(