[vulkan] Support emulated timeline semaphores

This commit adds an iree::hal::Semaphore implementation for Vulkan
using binary semaphores and fences. It supports the functionality
of a timeline semaphore while guaranteeing the usage requirements
of binary semaphores and fences. In order to do that, we also need
a special iree::hal::CommandQueue implementation for Vulkan that
is able to defer releasing submissions to the GPU.

Progress on https://github.com/google/iree/issues/2002.

Closes https://github.com/google/iree/pull/2208

PiperOrigin-RevId: 318074032
diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD
index 09130826..d4c5753 100644
--- a/iree/hal/vulkan/BUILD
+++ b/iree/hal/vulkan/BUILD
@@ -37,6 +37,8 @@
     },
 )
 
+# TODO(antiagainst): expose configuration for emulated timeline semaphore
+
 cc_library(
     name = "api",
     srcs = ["api.cc"],
@@ -188,6 +190,25 @@
 )
 
 cc_library(
+    name = "emulated_timeline_semaphore",
+    srcs = ["emulated_timeline_semaphore.cc"],
+    hdrs = ["emulated_timeline_semaphore.h"],
+    deps = [
+        ":handle_util",
+        ":status_util",
+        ":timepoint_util",
+        "//iree/base:intrusive_list",
+        "//iree/base:status",
+        "//iree/base:tracing",
+        "//iree/hal:semaphore",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        "@iree_vulkan_headers//:vulkan_headers_no_prototypes",
+    ],
+)
+
+cc_library(
     name = "extensibility_util",
     srcs = ["extensibility_util.cc"],
     hdrs = ["extensibility_util.h"],
@@ -327,6 +348,24 @@
 )
 
 cc_library(
+    name = "serializing_command_queue",
+    srcs = ["serializing_command_queue.cc"],
+    hdrs = ["serializing_command_queue.h"],
+    deps = [
+        ":direct_command_buffer",
+        ":emulated_timeline_semaphore",
+        ":handle_util",
+        ":status_util",
+        ":timepoint_util",
+        "//iree/base:status",
+        "//iree/base:tracing",
+        "//iree/hal:command_queue",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_library(
     name = "status_util",
     srcs = ["status_util.cc"],
     hdrs = ["status_util.h"],
@@ -337,6 +376,21 @@
 )
 
 cc_library(
+    name = "timepoint_util",
+    srcs = ["timepoint_util.cc"],
+    hdrs = ["timepoint_util.h"],
+    deps = [
+        ":handle_util",
+        "//iree/base:intrusive_list",
+        "//iree/base:ref_ptr",
+        "//iree/base:status",
+        "//iree/base:tracing",
+        "@com_google_absl//absl/synchronization",
+        "@iree_vulkan_headers//:vulkan_headers_no_prototypes",
+    ],
+)
+
+cc_library(
     name = "vma_allocator",
     srcs = [
         "internal_vk_mem_alloc.cc",
@@ -385,6 +439,7 @@
         ":direct_command_buffer",
         ":direct_command_queue",
         ":dynamic_symbols",
+        ":emulated_timeline_semaphore",
         ":extensibility_util",
         ":handle_util",
         ":native_descriptor_set",
@@ -392,6 +447,7 @@
         ":native_timeline_semaphore",
         ":pipeline_cache",
         ":pipeline_executable_layout",
+        ":serializing_command_queue",
         ":status_util",
         ":vma_allocator",
         "//iree/base:math",
diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt
index 0d2c52e..e534073 100644
--- a/iree/hal/vulkan/CMakeLists.txt
+++ b/iree/hal/vulkan/CMakeLists.txt
@@ -12,6 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# TODO(antiagainst): We should probably always compile the emulation in and
+# probe at runtime, enabling it only if the device does not support native
+# timeline semaphores.
+option(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE
+       "Emulates timeline semaphore with binary semaphores and fences" OFF)
+
+if(NOT IREE_HAL_DRIVER_VULKAN)
+  set(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE OFF CACHE BOOL "" FORCE)
+endif()
+
+if(IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORE)
+  set(IREE_EMULATE_TIMELINE_SEMAPHORE 1)
+else()
+  set(IREE_EMULATE_TIMELINE_SEMAPHORE 0)
+endif()
+
 set(VMA_SRC_ROOT
   "${IREE_ROOT_DIR}/third_party/vulkan_memory_allocator/src/"
 )
@@ -199,6 +215,30 @@
 
 iree_cc_library(
   NAME
+    emulated_timeline_semaphore
+  HDRS
+    "emulated_timeline_semaphore.h"
+  SRCS
+    "emulated_timeline_semaphore.cc"
+  COPTS
+    "-DVK_NO_PROTOTYPES"
+  DEPS
+    ::handle_util
+    ::status_util
+    ::timepoint_util
+    absl::inlined_vector
+    absl::synchronization
+    absl::time
+    iree::base::intrusive_list
+    iree::base::status
+    iree::base::tracing
+    iree::hal::semaphore
+    Vulkan::Headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     extensibility_util
   HDRS
     "extensibility_util.h"
@@ -225,9 +265,11 @@
   COPTS
     "-DVK_NO_PROTOTYPES"
   DEPS
+    absl::inlined_vector
     absl::synchronization
     absl::utility
     iree::base::ref_ptr
+    iree::hal::command_queue
     iree::hal::vulkan::dynamic_symbols
     iree::hal::vulkan::extensibility_util
     Vulkan::Headers
@@ -376,6 +418,30 @@
 
 iree_cc_library(
   NAME
+    serializing_command_queue
+  HDRS
+    "serializing_command_queue.h"
+  SRCS
+    "serializing_command_queue.cc"
+  COPTS
+    "-DVK_NO_PROTOTYPES"
+  DEPS
+    ::direct_command_buffer
+    ::emulated_timeline_semaphore
+    ::handle_util
+    ::status_util
+    ::timepoint_util
+    absl::inlined_vector
+    absl::synchronization
+    iree::base::status
+    iree::base::tracing
+    iree::hal::command_queue
+    Vulkan::Headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     status_util
   HDRS
     "status_util.h"
@@ -391,6 +457,26 @@
 
 iree_cc_library(
   NAME
+    timepoint_util
+  HDRS
+    "timepoint_util.h"
+  SRCS
+    "timepoint_util.cc"
+  COPTS
+    "-DVK_NO_PROTOTYPES"
+  DEPS
+    ::handle_util
+    absl::synchronization
+    iree::base::intrusive_list
+    iree::base::ref_ptr
+    iree::base::status
+    iree::base::tracing
+    Vulkan::Headers
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     vma_allocator
   HDRS
     "vma_allocator.h"
@@ -431,18 +517,21 @@
   SRCS
     "vulkan_device.cc"
   COPTS
+    "-DIREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES=${IREE_EMULATE_TIMELINE_SEMAPHORE}"
     "-DVK_NO_PROTOTYPES"
   DEPS
     ::descriptor_pool_cache
     ::direct_command_buffer
     ::direct_command_queue
     ::dynamic_symbols
+    ::emulated_timeline_semaphore
     ::extensibility_util
     ::handle_util
     ::native_descriptor_set
     ::native_timeline_semaphore
     ::pipeline_cache
     ::pipeline_executable_layout
+    ::serializing_command_queue
     ::status_util
     ::vma_allocator
     absl::inlined_vector
@@ -498,6 +587,7 @@
   SRCS
     "vulkan_driver_module.cc"
   COPTS
+    "-DIREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES=${IREE_EMULATE_TIMELINE_SEMAPHORE}"
     "-DVK_NO_PROTOTYPES"
   DEPS
     absl::flags
diff --git a/iree/hal/vulkan/emulated_timeline_semaphore.cc b/iree/hal/vulkan/emulated_timeline_semaphore.cc
new file mode 100644
index 0000000..9656310
--- /dev/null
+++ b/iree/hal/vulkan/emulated_timeline_semaphore.cc
@@ -0,0 +1,322 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
+
+#include "absl/container/inlined_vector.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "absl/utility/utility.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// static
+StatusOr<ref_ptr<Semaphore>> EmulatedTimelineSemaphore::Create(
+    ref_ptr<VkDeviceHandle> logical_device,
+    std::function<Status(Semaphore*)> on_signal,
+    std::function<void(Semaphore*)> on_failure,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Create");
+  return make_ref<EmulatedTimelineSemaphore>(
+      std::move(logical_device), std::move(on_signal), std::move(on_failure),
+      std::move(semaphore_pool), initial_value);
+}
+
+EmulatedTimelineSemaphore::EmulatedTimelineSemaphore(
+    ref_ptr<VkDeviceHandle> logical_device,
+    std::function<Status(Semaphore*)> on_signal,
+    std::function<void(Semaphore*)> on_failure,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value)
+    : signaled_value_(initial_value),
+      logical_device_(std::move(logical_device)),
+      on_signal_(std::move(on_signal)),
+      on_failure_(std::move(on_failure)),
+      semaphore_pool_(std::move(semaphore_pool)) {}
+
+EmulatedTimelineSemaphore::~EmulatedTimelineSemaphore() {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::dtor");
+  CHECK_OK(TryToAdvanceTimeline(UINT64_MAX).status());
+  absl::MutexLock lock(&mutex_);
+  CHECK(outstanding_semaphores_.empty())
+      << "Destroying an emulated timeline semaphore without first waiting on "
+         "outstanding signals";
+}
+
+StatusOr<uint64_t> EmulatedTimelineSemaphore::Query() {
+  RETURN_IF_ERROR(TryToAdvanceTimeline(UINT64_MAX).status());
+  uint64_t value = signaled_value_.load();
+  if (value == UINT64_MAX) {
+    absl::MutexLock lock(&mutex_);
+    return status_;
+  }
+  return value;
+}
+
+Status EmulatedTimelineSemaphore::Signal(uint64_t value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Signal");
+  auto signaled_value = signaled_value_.exchange(value);
+  // Make sure the previous signaled value is smaller than the new value.
+  CHECK(signaled_value < value)
+      << "Attempting to signal a timeline value out of order; trying " << value
+      << " but " << signaled_value << " already signaled";
+
+  // Inform the device to make progress given we have a new value signaled now.
+  RETURN_IF_ERROR(on_signal_(this));
+
+  return OkStatus();
+}
+
+Status EmulatedTimelineSemaphore::Wait(uint64_t value, absl::Time deadline) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait");
+
+  VkFence fence = VK_NULL_HANDLE;
+  do {
+    IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Wait#loop");
+    // First try to advance the timeline without blocking to see whether we've
+    // already reached the desired value.
+    ASSIGN_OR_RETURN(bool reached_desired_value, TryToAdvanceTimeline(value));
+    if (reached_desired_value) return OkStatus();
+
+    // We must wait now. Find the first emulated time point that has a value >=
+    // the desired value so we can wait on its associated signal fence to make
+    // sure the timeline is advanced to the desired value.
+    absl::MutexLock lock(&mutex_);
+    auto semaphore = outstanding_semaphores_.begin();
+    for (; semaphore != outstanding_semaphores_.end(); ++semaphore) {
+      if ((*semaphore)->value >= value) break;
+    }
+    if (semaphore != outstanding_semaphores_.end()) {
+      if (!(*semaphore)->signal_fence) {
+        return InternalErrorBuilder(IREE_LOC)
+               << "Timeline should have a signal fence for the first time "
+                  "point beyond the signaled value";
+      }
+      fence = (*semaphore)->signal_fence->value();
+      // Found; we can break the loop and proceed to waiting now.
+      break;
+    }
+    // TODO(antiagainst): figure out a better way instead of the busy loop here.
+  } while (absl::Now() < deadline);
+
+  if (fence == VK_NULL_HANDLE) {
+    return DeadlineExceededErrorBuilder(IREE_LOC)
+           << "Deadline reached when waiting timeline semaphore";
+  }
+
+  uint64_t timeout_nanos;
+  if (deadline == absl::InfiniteFuture()) {
+    timeout_nanos = UINT64_MAX;
+  } else if (deadline == absl::InfinitePast()) {
+    timeout_nanos = 0;
+  } else {
+    auto relative_nanos = absl::ToInt64Nanoseconds(deadline - absl::Now());
+    timeout_nanos = relative_nanos < 0 ? 0 : relative_nanos;
+  }
+
+  VK_RETURN_IF_ERROR(logical_device_->syms()->vkWaitForFences(
+      *logical_device_, /*fenceCount=*/1, &fence, /*waitAll=*/true,
+      timeout_nanos));
+
+  RETURN_IF_ERROR(TryToAdvanceTimeline(value).status());
+  return OkStatus();
+}
+
+void EmulatedTimelineSemaphore::Fail(Status status) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::Fail");
+  absl::MutexLock lock(&mutex_);
+  status_ = std::move(status);
+  signaled_value_.store(UINT64_MAX);
+}
+
+VkSemaphore EmulatedTimelineSemaphore::GetWaitSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& wait_fence) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetWaitSemaphore");
+  absl::MutexLock lock(&mutex_);
+
+  VkSemaphore semaphore = VK_NULL_HANDLE;
+  for (TimePointSemaphore* point : outstanding_semaphores_) {
+    if (point->value >= value && !point->wait_fence) {
+      point->wait_fence = add_ref(wait_fence);
+      semaphore = point->semaphore;
+      break;
+    }
+  }
+
+  return semaphore;
+}
+
+Status EmulatedTimelineSemaphore::CancelWaitSemaphore(VkSemaphore semaphore) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::CancelWaitSemaphore");
+  absl::MutexLock lock(&mutex_);
+  for (TimePointSemaphore* point : outstanding_semaphores_) {
+    if (point->semaphore != semaphore) continue;
+
+    if (!point->wait_fence) {
+      return InvalidArgumentErrorBuilder(IREE_LOC)
+             << "Time point wasn't waited before";
+    }
+    point->wait_fence = nullptr;
+    return OkStatus();
+  }
+  return InvalidArgumentErrorBuilder(IREE_LOC)
+         << "No time point for the given semaphore";
+}
+
+StatusOr<VkSemaphore> EmulatedTimelineSemaphore::GetSignalSemaphore(
+    uint64_t value, const ref_ptr<TimePointFence>& signal_fence) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::GetSignalSemaphore");
+
+  if (signaled_value_.load() >= value) {
+    return FailedPreconditionErrorBuilder(IREE_LOC)
+           << "Timeline semaphore already signaled past " << value;
+  }
+
+  absl::MutexLock lock(&mutex_);
+
+  auto insertion_point = outstanding_semaphores_.begin();
+  while (insertion_point != outstanding_semaphores_.end() &&
+         (*insertion_point)->value <= value)
+    ++insertion_point;
+
+  ASSIGN_OR_RETURN(TimePointSemaphore * semaphore, semaphore_pool_->Acquire());
+  semaphore->value = value;
+  semaphore->signal_fence = add_ref(signal_fence);
+  if (semaphore->wait_fence) {
+    return InternalErrorBuilder(IREE_LOC)
+           << "Newly acquired time point semaphore should not have waiters";
+  }
+  outstanding_semaphores_.insert(insertion_point, semaphore);
+
+  return semaphore->semaphore;
+}
+
+StatusOr<bool> EmulatedTimelineSemaphore::TryToAdvanceTimeline(
+    uint64_t to_upper_value) {
+  IREE_TRACE_SCOPE0("EmulatedTimelineSemaphore::TryToAdvanceTimeline");
+
+  // We hold the lock during the entire resolve process so that we can resolve
+  // to the furthest possible value.
+  absl::MutexLock lock(&mutex_);
+
+  uint64_t past_value = signaled_value_.load();
+
+  // Fast path for when already signaled past the desired value.
+  if (past_value >= to_upper_value) return true;
+
+  // The timeline has not signaled past the desired value and there is no
+  // binary semaphore pending on GPU yet: certainly the timeline cannot
+  // advance to the desired value.
+  if (outstanding_semaphores_.empty()) return false;
+
+  IntrusiveList<TimePointSemaphore> resolved_semaphores;
+
+  bool keep_resolving = true;
+  bool reached_desired_value = false;
+  while (keep_resolving && !outstanding_semaphores_.empty()) {
+    auto* semaphore = outstanding_semaphores_.front();
+
+    // If the current semaphore is for a value beyond our upper limit, then
+    // early exit so that we don't spend time dealing with signals we don't yet
+    // care about. This can prevent live lock where one thread is signaling
+    // fences as fast/faster than another thread can consume them.
+    if (semaphore->value > to_upper_value) {
+      keep_resolving = false;
+      reached_desired_value = true;
+      break;
+    }
+
+    // If the current semaphore is for a value not greater than the past
+    // signaled value, then we know it was signaled previously. But there might
+    // be a waiter on it on GPU.
+    if (semaphore->value <= past_value) {
+      if (semaphore->signal_fence) {
+        return InternalErrorBuilder(IREE_LOC)
+               << "Timeline should have already signaled past this time "
+                  "point and cleared the signal fence";
+      }
+
+      // If there are no waiters, we can recycle this semaphore now. If there
+      // exists one waiter, then query its status and recycle on success. We
+      // only handle success status here. Others will be handled when the fence
+      // is checked for other semaphores' signaling status for the same queue
+      // submission.
+      if (!semaphore->wait_fence ||
+          semaphore->wait_fence->GetStatus() == VK_SUCCESS) {
+        semaphore->signal_fence = nullptr;
+        semaphore->wait_fence = nullptr;
+        outstanding_semaphores_.erase(semaphore);
+        resolved_semaphores.push_back(semaphore);
+      }
+
+      continue;
+    }
+
+    // This semaphore represents a value greater than the known previously
+    // signaled value. We don't know its status so we need to really query now.
+
+    if (!semaphore->signal_fence) {
+      return InternalErrorBuilder(IREE_LOC)
+             << "The status of this time point in the timeline should still be "
+                "pending with a signal fence";
+    }
+    VkResult signal_status = semaphore->signal_fence->GetStatus();
+
+    switch (signal_status) {
+      case VK_SUCCESS:
+        signaled_value_.store(semaphore->value);
+        semaphore->signal_fence = nullptr;
+        // If no waiters, we can recycle this semaphore now.
+        if (!semaphore->wait_fence) {
+          semaphore->signal_fence = nullptr;
+          semaphore->wait_fence = nullptr;
+          outstanding_semaphores_.erase(semaphore);
+          resolved_semaphores.push_back(semaphore);
+        }
+        break;
+      case VK_NOT_READY:
+        // The fence has not been signaled yet so this is the furthest time
+        // point we can go in this timeline.
+        keep_resolving = false;
+        break;
+      default:
+        // Fence indicates an error (device lost, out of memory, etc).
+        // Propagate this back to our status (and thus any waiters).
+        // Since we only take the first error we find we skip all remaining
+        // fences.
+        keep_resolving = false;
+        semaphore->signal_fence = nullptr;
+        status_ = VkResultToStatus(signal_status);
+        signaled_value_.store(UINT64_MAX);
+        break;
+    }
+  }
+
+  semaphore_pool_->ReleaseResolved(&resolved_semaphores);
+  if (!status_.ok()) {
+    on_failure_(this);
+    semaphore_pool_->ReleaseUnresolved(&outstanding_semaphores_);
+    return status_;
+  }
+
+  return reached_desired_value;
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/iree/hal/vulkan/emulated_timeline_semaphore.h b/iree/hal/vulkan/emulated_timeline_semaphore.h
new file mode 100644
index 0000000..cc13a09
--- /dev/null
+++ b/iree/hal/vulkan/emulated_timeline_semaphore.h
@@ -0,0 +1,223 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_EMULATED_TIMELINE_SEMAPHORE_H_
+#define IREE_HAL_VULKAN_EMULATED_TIMELINE_SEMAPHORE_H_
+
+#include <vulkan/vulkan.h>
+
+#include <atomic>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/semaphore.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A timeline semaphore emulated via `VkFence`s and binary `VkSemaphore`s.
+//
+// Vulkan provides several explicit synchronization primitives: fences,
+// (binary/timeline) semaphores, events, pipeline barriers, and render passes.
+// See "6. Synchronization and Cache Control" of the Vulkan specification
+// for the details.
+//
+// Render passes are for graphics pipelines so IREE does not care about them.
+// Pipeline barriers synchronize control within a command buffer at a single
+// point. Fences, (binary/timeline) semaphores, and events are synchronization
+// primitives that have separate signal and wait operations. Events are more
+// fine-grained compared to fences and semaphores given that they can be
+// signaled or waited within a command buffer while fences and semaphores are
+// at queue submissions. Each of them have its usage requirements:
+//
+// * Fences must be signaled on GPU and waited on CPU. Fences must be reset
+//   before reuse.
+// * Binary semaphores must be signaled on GPU and waited on GPU. They do not
+//   support wait-before-signal submission order. More importantly, binary
+//   semaphore wait also unsignals the semaphore. So binary semaphore signals
+//   and waits should occur in discrete 1:1 pairs.
+// * Timeline semaphores can be signaled on CPU or GPU and waited on CPU or GPU.
+//   They support wait-before-signal submission order. Timeline semaphores do
+//   not need to be reset.
+//
+// It's clear that timeline semaphore is more flexible than fences and binary
+// semaphores: it unifies GPU and CPU synchronization with a single primitive.
+// But it's not always available: it requires the VK_KHR_timeline_semaphore
+// or Vulkan 1.2. When it's not available, it can be emulated via `VkFence`s
+// and binary `VkSemaphore`s. The emulation need to provide the functionality of
+// timeline semaphores and also not violate the usage requirements of `VkFence`s
+// and binary `VkSemaphore`s.
+//
+// The basic idea is to create a timeline object with time points to emulate the
+// timeline semaphore, which consists of a monotonically increasing 64-bit
+// integer value. Each time point represents a specific signaled/waited integer
+// value of the timeline semaphore; each time point can associate with binary
+// `VkSemaphore`s and/or `VkFence`s for emulating the synchronization.
+//
+// Concretely, for each of the possible signal -> wait scenarios timeline
+// semaphore supports:
+//
+// ### GPU -> GPU (via `vkQueueSubmit`)
+//
+// Each `vkQueueSubmit` can attach a `VkTimelineSemaphoreSubmitInfo` to describe
+// the timeline semaphore values signaled and waited. Each of the signaled value
+// will be a time point and emulated by a binary `VkSemaphore`. We submit the
+// binary `VkSemaphore`s to the GPU under the hood. For the waited values, the
+// situation is more complicated because of the differences between binary and
+// timeline semaphores:
+//
+// * Binary semaphore signal-wait relationship is strictly 1:1, unlike timeline
+//   semaphore where we can have 1:N cases. This means for a specific binary
+//   `VkSemaphore` used to emulate a signaled time point, we can have at most
+//   one subsequent `vkQueueSubmit` waits on it. We need other mechanisms for
+//   additional waits. A simple way is to involve the CPU and don't submit
+//   the additional work to queue until the desired value is already signaled
+//   past. This requires `VkFence`s for letting the CPU know the status of
+//   GPU progress, but `VkFence` is needed anyway because of GPU -> CPU
+//   synchronization.
+// * Binary semaphores do not support wait-before-signal submission order.
+//   This means we need to put the submission into a self-managed queue if the
+//   binary semaphores used to emulate the time points waited by the submission
+//   are not submitted to GPU yet.
+//
+// ### GPU -> CPU (via `vkWaitSemaphores`)
+//
+// Without timeline semaphore, we need to use fences to let CPU wait on GPU
+// progress. So this direction can be emulated by `vkWaitFences`. It means we
+// need to associate a `VkFence` with the given waited timeline semaphores.
+// Because we don't know whether a particular `vkQueueSubmit` with timeline
+// semaphores will be later waited on by CPU beforehand, we need to bundle each
+// of them with a `VkFence` just in case they will be waited on later.
+//
+// ### CPU -> GPU (via `vkSignalSemaphore`)
+//
+// This direction can be handled by bumping the signaled timeline value and
+// scan the self-managed queue to submit more work to GPU if possible.
+//
+// ### CPU -> CPU (via `vkWaitSemaphores`)
+//
+// This is similar to CPU -> GPU direction; we just need to enable other threads
+// on CPU side and let them progress.
+//
+// The implementation is inspired by the Vulkan-ExtensionLayer project:
+// https://github.com/KhronosGroup/Vulkan-ExtensionLayer. We don't handle all
+// the aspects of the full spec though given that IREE only uses a subset of
+// synchronization primitives. So this should not be treated as a full
+// emulation of the Vulkan spec and thus does not substitute
+// Vulkan-ExtensionLayer.
+class EmulatedTimelineSemaphore final : public Semaphore {
+ public:
+  // Creates a timeline semaphore with the given |initial_value|.
+  static StatusOr<ref_ptr<Semaphore>> Create(
+      ref_ptr<VkDeviceHandle> logical_device,
+      std::function<Status(Semaphore*)> on_signal,
+      std::function<void(Semaphore*)> on_failure,
+      ref_ptr<TimePointSemaphorePool> semaphore_pool, uint64_t initial_value);
+
+  EmulatedTimelineSemaphore(ref_ptr<VkDeviceHandle> logical_device,
+                            std::function<Status(Semaphore*)> on_signal,
+                            std::function<void(Semaphore*)> on_failure,
+                            ref_ptr<TimePointSemaphorePool> semaphore_pool,
+                            uint64_t initial_value);
+
+  ~EmulatedTimelineSemaphore() override;
+
+  StatusOr<uint64_t> Query() override;
+
+  Status Signal(uint64_t value) override;
+
+  Status Wait(uint64_t value, absl::Time deadline) override;
+
+  void Fail(Status status) override;
+
+  // Gets a binary semaphore for waiting on the timeline to advance to the given
+  // |value|. The semaphore returned won't be waited by anyone else. Returns
+  // VK_NULL_HANDLE if no available semaphores for the given |value|.
+  // |wait_fence| is the fence associated with the queue submission that is
+  // waiting on this semaphore.
+  VkSemaphore GetWaitSemaphore(uint64_t value,
+                               const ref_ptr<TimePointFence>& wait_fence);
+
+  // Cancels the waiting attempt on the given binary |semaphore|. This allows
+  // the |semaphore| to be waited by others.
+  Status CancelWaitSemaphore(VkSemaphore semaphore);
+
+  // Gets a binary semaphore for signaling the timeline to the given |value|.
+  // |value| must be greater than the current signaled value. |signal_fence| is
+  // the fence associated with the queue submission that signals this semaphore.
+  StatusOr<VkSemaphore> GetSignalSemaphore(
+      uint64_t value, const ref_ptr<TimePointFence>& signal_fence);
+
+ private:
+  // Tries to advance the timeline to the given |to_upper_value| without
+  // blocking and returns whether the |to_upper_value| is reached.
+  StatusOr<bool> TryToAdvanceTimeline(uint64_t to_upper_value)
+      ABSL_LOCKS_EXCLUDED(mutex_);
+
+  std::atomic<uint64_t> signaled_value_;
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  // Callback to inform that this timeline semaphore has signaled a new value.
+  std::function<Status(Semaphore*)> on_signal_;
+
+  // Callback to inform that this timeline semaphore has encountered a failure.
+  std::function<void(Semaphore*)> on_failure_;
+
+  ref_ptr<TimePointSemaphorePool> semaphore_pool_;
+
+  mutable absl::Mutex mutex_;
+
+  // A list of outstanding semaphores used to emulate time points.
+  //
+  // The life time of each semaphore is in one of the following state:
+  //
+  // * Unused state: value = UINT64_MAX, signal/wait fence = nullptr. This is
+  //   the state of the semaphore when it's initially acquired from the pool and
+  //   not put in the queue for emulating a time point yet.
+  // * Pending state: signaled value < value < UINT64_MAX, signal fence =
+  //   <some-fence>, wait fence == nullptr. This is the state of the semaphore
+  //   when it's put into the GPU queue for emulating a time point.
+  // * Pending and waiting state: signaled value < value < UINT64_MAX, signal
+  //   fence = <some-fence>, wait fence == <some-fence>. This is the state of
+  //   the semaphore when it's put into the GPU queue for emulating a time
+  //   point and there is another queue submission waiting on it in GPU.
+  // * Signaled and not ever waited state: value <= signaled value, signal/wait
+  //   fence = nullptr. This is the state of the semaphore when we know it's
+  //   already signaled on GPU and there is no waiters for it.
+  // * Signaled and waiting state: value <= signaled value, signal fence =
+  //   nullptr, wait fence = <some-fence>. This is the state of the semaphore
+  //   when we know it's already signaled on GPU and there is still one queue
+  //   submission on GPU is waiting for it.
+  IntrusiveList<TimePointSemaphore> outstanding_semaphores_
+      ABSL_GUARDED_BY(mutex_);
+
+  // NOTE: We only need to access this status (and thus take the lock) when we
+  // want to either signal failure or query the status in the case of the
+  // semaphore being set to UINT64_MAX.
+  Status status_ ABSL_GUARDED_BY(mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_EMULATED_TIMELINE_SEMAPHORE_H_
diff --git a/iree/hal/vulkan/serializing_command_queue.cc b/iree/hal/vulkan/serializing_command_queue.cc
new file mode 100644
index 0000000..9d6d24c
--- /dev/null
+++ b/iree/hal/vulkan/serializing_command_queue.cc
@@ -0,0 +1,355 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/serializing_command_queue.h"
+
+#include <memory>
+
+#include "absl/time/clock.h"
+#include "absl/types/span.h"
+#include "iree/base/memory.h"
+#include "iree/base/source_location.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/command_queue.h"
+#include "iree/hal/semaphore.h"
+#include "iree/hal/vulkan/direct_command_buffer.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+// Tries to prepare all necessary binary `VkSemaphore`s for emulating the time
+// points as specified in the given submission |batch| and returns true if
+// possible so that the |batch| is ready to be submitted to GPU.
+// |wait_semaphores| and |signal_semaphores| will be filled with the binary
+// `VkSemaphore`s on success. |fence| is the fence associated with the
+// submission |batch|.
+StatusOr<bool> TryToPrepareSemaphores(
+    const SubmissionBatch& batch, const ref_ptr<TimePointFence>& fence,
+    absl::InlinedVector<VkSemaphore, 4>* wait_semaphores,
+    absl::InlinedVector<VkSemaphore, 4>* signal_semaphores) {
+  IREE_TRACE_SCOPE0("TryToPrepareSemaphores");
+
+  wait_semaphores->clear();
+  // Tracks, for each collected binary semaphore, the timeline it was acquired
+  // from so that a failed preparation can cancel each wait on the timeline it
+  // actually belongs to.
+  absl::InlinedVector<EmulatedTimelineSemaphore*, 4> wait_owners;
+  for (const auto& timeline_semaphore : batch.wait_semaphores) {
+    // Query first to progress this timeline semaphore to the furthest.
+    ASSIGN_OR_RETURN(auto signaled_value,
+                     timeline_semaphore.semaphore->Query());
+
+    // If it's already signaled to a value greater than we require here,
+    // we can just ignore this semaphore now.
+    if (signaled_value >= timeline_semaphore.value) continue;
+
+    // SerializingCommandQueue only works with EmulatedTimelineSemaphore.
+    auto* emulated_semaphore =
+        static_cast<EmulatedTimelineSemaphore*>(timeline_semaphore.semaphore);
+
+    // Otherwise try to get a binary semaphore for this time point so that
+    // we can wait on.
+    VkSemaphore binary_semaphore =
+        emulated_semaphore->GetWaitSemaphore(timeline_semaphore.value, fence);
+
+    if (binary_semaphore == VK_NULL_HANDLE) {
+      // We cannot wait on this time point yet: there are no previous semaphores
+      // submitted to the GPU that can signal a value greater than what's
+      // desired here.
+
+      // Cancel the waits collected so far so others may make progress. Each
+      // binary semaphore must be canceled on the timeline it came from, not
+      // on the timeline currently being processed.
+      for (int i = 0, e = wait_semaphores->size(); i < e; ++i) {
+        RETURN_IF_ERROR(
+            wait_owners[i]->CancelWaitSemaphore((*wait_semaphores)[i]));
+      }
+
+      // This batch cannot be submitted to GPU yet.
+      return false;
+    }
+
+    wait_semaphores->push_back(binary_semaphore);
+    wait_owners.push_back(emulated_semaphore);
+  }
+
+  // We've collected all necessary binary semaphores for each timeline we need
+  // to wait on. Now prepare binary semaphores for signaling.
+  signal_semaphores->clear();
+  for (const auto& timeline_semaphore : batch.signal_semaphores) {
+    // SerializingCommandQueue only works with EmulatedTimelineSemaphore.
+    auto* emulated_semaphore =
+        static_cast<EmulatedTimelineSemaphore*>(timeline_semaphore.semaphore);
+
+    ASSIGN_OR_RETURN(auto binary_semaphore,
+                     emulated_semaphore->GetSignalSemaphore(
+                         timeline_semaphore.value, fence));
+    signal_semaphores->push_back(binary_semaphore);
+  }
+
+  // Good to submit!
+  return true;
+}
+
+// Prepares a `VkSubmitInfo` to submit the given list of |command_buffers| that
+// wait on |wait_semaphores| and signal |signal_semaphores|. Necessary
+// structures are allocated from |arena| and the result `VkSubmitInfo` is
+// written to |submit_info|.
+void PrepareSubmitInfo(
+    const absl::InlinedVector<VkSemaphore, 4>& wait_semaphores,
+    absl::Span<CommandBuffer* const> command_buffers,
+    const absl::InlinedVector<VkSemaphore, 4>& signal_semaphores,
+    VkSubmitInfo* submit_info, Arena* arena) {
+  IREE_TRACE_SCOPE0("PrepareSubmitInfo");
+
+  // TODO(benvanik): see if we can go to finer-grained stages.
+  // For example, if this was just queue ownership transfers then we can use
+  // the pseudo-stage of VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT.
+  const VkPipelineStageFlags dst_stage_mask =
+      VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+
+  // Copy the wait semaphores (and one stage mask per wait) into arena storage
+  // that outlives this call.
+  auto wait_handles = arena->AllocateSpan<VkSemaphore>(wait_semaphores.size());
+  auto wait_stage_masks =
+      arena->AllocateSpan<VkPipelineStageFlags>(wait_semaphores.size());
+  int wait_index = 0;
+  for (VkSemaphore semaphore : wait_semaphores) {
+    wait_handles[wait_index] = semaphore;
+    wait_stage_masks[wait_index] = dst_stage_mask;
+    ++wait_index;
+  }
+
+  // Same for the signal semaphores.
+  auto signal_handles =
+      arena->AllocateSpan<VkSemaphore>(signal_semaphores.size());
+  int signal_index = 0;
+  for (VkSemaphore semaphore : signal_semaphores) {
+    signal_handles[signal_index++] = semaphore;
+  }
+
+  // Unwrap the native `VkCommandBuffer` handle from each HAL command buffer.
+  auto command_buffer_handles =
+      arena->AllocateSpan<VkCommandBuffer>(command_buffers.size());
+  int buffer_index = 0;
+  for (CommandBuffer* command_buffer : command_buffers) {
+    command_buffer_handles[buffer_index++] =
+        static_cast<DirectCommandBuffer*>(command_buffer->impl())->handle();
+  }
+
+  submit_info->sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  submit_info->pNext = nullptr;
+  submit_info->waitSemaphoreCount = wait_handles.size();
+  submit_info->pWaitSemaphores = wait_handles.data();
+  submit_info->pWaitDstStageMask = wait_stage_masks.data();
+  submit_info->commandBufferCount = command_buffer_handles.size();
+  submit_info->pCommandBuffers = command_buffer_handles.data();
+  submit_info->signalSemaphoreCount = signal_handles.size();
+  submit_info->pSignalSemaphores = signal_handles.data();
+}
+
+}  // namespace
+
+// Creates a queue named |name| supporting |supported_categories| commands,
+// submitting work to the underlying |queue|. Fences used to track submission
+// progress are acquired from |fence_pool|.
+SerializingCommandQueue::SerializingCommandQueue(
+    std::string name, CommandCategoryBitfield supported_categories,
+    const ref_ptr<VkDeviceHandle>& logical_device,
+    const ref_ptr<TimePointFencePool>& fence_pool, VkQueue queue)
+    : CommandQueue(std::move(name), supported_categories),
+      logical_device_(add_ref(logical_device)),
+      fence_pool_(add_ref(fence_pool)),
+      queue_(queue) {}
+
+SerializingCommandQueue::~SerializingCommandQueue() {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::dtor");
+  absl::MutexLock lock(&mutex_);
+  // Block until previously submitted work finishes on the GPU so resources
+  // referenced by in-flight submissions can be safely torn down.
+  syms()->vkQueueWaitIdle(queue_);
+}
+
+// Enqueues |batches| for submission. Each batch is paired with a fence from
+// the fence pool and appended to the deferred list; actual GPU submission only
+// happens once every binary semaphore a batch waits on is known to have been
+// submitted (see ProcessDeferredSubmissions()).
+Status SerializingCommandQueue::Submit(
+    absl::Span<const SubmissionBatch> batches) {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::Submit");
+
+  absl::MutexLock lock(&mutex_);
+  for (const auto& batch : batches) {
+    // Grab a fence for this submission first. This will be used to check the
+    // progress of emulated timeline semaphores later.
+    ASSIGN_OR_RETURN(auto fence, fence_pool_->Acquire());
+    deferred_submissions_.push_back(
+        std::make_unique<FencedSubmission>(batch, std::move(fence)));
+  }
+
+  // Opportunistically submit whatever is ready now; only the status (not the
+  // "did anything get submitted" bool) matters to the caller here.
+  return ProcessDeferredSubmissions().status();
+}
+
+// Walks the deferred submission list and submits every batch whose wait
+// semaphores have all already been submitted to the GPU; batches that are not
+// ready yet are kept deferred (in their original order). Returns true if at
+// least one batch reached the GPU.
+StatusOr<bool> SerializingCommandQueue::ProcessDeferredSubmissions() {
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::ProcessDeferredSubmissions");
+
+  // Prepare `VkSubmitInfo`s for all submissions we are able to submit.
+
+  // Note that we must keep all arrays referenced alive until submission
+  // completes and since there are a bunch of them we use an arena.
+  Arena arena(4 * 1024);
+
+  absl::InlinedVector<VkSubmitInfo, 4> submit_infos;
+  absl::InlinedVector<VkFence, 4> submit_fences;
+
+  // Scratch storage reused across iterations of the loop below.
+  absl::InlinedVector<VkSemaphore, 4> wait_semaphores;
+  absl::InlinedVector<VkSemaphore, 4> signal_semaphores;
+
+  // A list of submissions that still needs to be deferred.
+  IntrusiveList<std::unique_ptr<FencedSubmission>> remaining_submissions;
+
+  while (!deferred_submissions_.empty()) {
+    wait_semaphores.clear();
+    signal_semaphores.clear();
+
+    // Pop the oldest deferred submission and see whether it can go out now.
+    auto submission = deferred_submissions_.take(deferred_submissions_.front());
+    const SubmissionBatch& batch = submission->batch;
+    ref_ptr<TimePointFence> fence(std::move(submission->fence));
+
+    ASSIGN_OR_RETURN(bool ready_to_submit,
+                     TryToPrepareSemaphores(batch, fence, &wait_semaphores,
+                                            &signal_semaphores));
+
+    if (ready_to_submit) {
+      submit_infos.emplace_back();
+      PrepareSubmitInfo(wait_semaphores, batch.command_buffers,
+                        signal_semaphores, &submit_infos.back(), &arena);
+      submit_fences.push_back(fence->value());
+      pending_fences_.emplace_back(std::move(fence));
+    } else {
+      // We need to defer the submission until later.
+      remaining_submissions.push_back(std::move(submission));
+    }
+  }
+
+  if (submit_infos.empty()) return false;
+
+  // Note: We might be able to batch the submission but it involves non-trivial
+  // fence handling. We can handle that if really needed.
+  for (int i = 0, e = submit_infos.size(); i < e; ++i) {
+    VK_RETURN_IF_ERROR(syms()->vkQueueSubmit(
+        queue_, /*submitCount=*/1, &submit_infos[i], submit_fences[i]));
+  }
+
+  // Put still-not-ready submissions back on the deferred list, preserving
+  // their original order.
+  while (!remaining_submissions.empty()) {
+    deferred_submissions_.push_back(
+        remaining_submissions.take(remaining_submissions.front()));
+  }
+
+  return true;
+}
+
+// Waits until both the deferred and already-submitted work complete, or
+// |deadline| passes, whichever comes first.
+Status SerializingCommandQueue::WaitIdle(absl::Time deadline) {
+  absl::MutexLock lock(&mutex_);
+
+  if (deadline == absl::InfiniteFuture()) {
+    IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#vkQueueWaitIdle");
+    // Fast path for using vkQueueWaitIdle, which is usually cheaper (as it
+    // requires fewer calls into the driver).
+
+    // Complete all pending work on the queue.
+    VK_RETURN_IF_ERROR(syms()->vkQueueWaitIdle(queue_));
+    pending_fences_.clear();
+
+    // Submit and complete all deferred work.
+    // NOTE(review): if ProcessDeferredSubmissions() never finds ready work
+    // (e.g. a wait that can never be satisfied) this loop spins forever —
+    // consistent with an infinite deadline, but worth confirming.
+    while (!deferred_submissions_.empty()) {
+      ASSIGN_OR_RETURN(bool work_submitted, ProcessDeferredSubmissions());
+      if (work_submitted) {
+        VK_RETURN_IF_ERROR(syms()->vkQueueWaitIdle(queue_));
+        pending_fences_.clear();
+      }
+    }
+
+    return OkStatus();
+  }
+
+  IREE_TRACE_SCOPE0("SerializingCommandQueue::WaitIdle#Fence");
+
+  // Keep trying to submit more workload to the GPU until reaching the deadline.
+  do {
+    RETURN_IF_ERROR(ProcessDeferredSubmissions().status());
+
+    uint64_t timeout_nanos;
+    if (deadline == absl::InfinitePast()) {
+      // Do not wait.
+      timeout_nanos = 0;
+    } else {
+      // Convert to relative time in nanoseconds.
+      // The implementation may not wait with this granularity (like, by
+      // 10000x).
+      absl::Time now = absl::Now();
+      if (deadline < now) {
+        return DeadlineExceededErrorBuilder(IREE_LOC)
+               << "Deadline exceeded waiting for idle";
+      }
+      timeout_nanos =
+          static_cast<uint64_t>(absl::ToInt64Nanoseconds(deadline - now));
+    }
+
+    // Nothing submitted yet; loop to try pushing more deferred work out.
+    if (pending_fences_.empty()) continue;
+
+    // Gather the raw fence handles for all submissions currently on the GPU.
+    std::vector<VkFence> fences;
+    fences.reserve(pending_fences_.size());
+    for (const auto& fence : pending_fences_) fences.push_back(fence->value());
+
+    VkResult result =
+        syms()->vkWaitForFences(*logical_device_, fences.size(), fences.data(),
+                                /*waitAll=*/VK_TRUE, timeout_nanos);
+
+    switch (result) {
+      case VK_SUCCESS:
+        pending_fences_.clear();
+        break;
+      case VK_TIMEOUT:
+        return DeadlineExceededErrorBuilder(IREE_LOC)
+               << "Deadline exceeded waiting for idle";
+      default:
+        return VkResultToStatus(result);
+    }
+    // As long as there is submitted or deferred work still pending.
+  } while (!pending_fences_.empty() || !deferred_submissions_.empty());
+
+  return OkStatus();
+}
+
+// Releases any deferred submissions that have become ready to the GPU.
+Status SerializingCommandQueue::AdvanceQueueSubmission() {
+  absl::MutexLock lock(&mutex_);
+  // ProcessDeferredSubmissions() also reports whether any newly ready
+  // submissions were actually sent to the GPU. Other callers might care about
+  // that, but here we only want to advance queue submission if possible, so
+  // the boolean result is discarded and only the status is propagated.
+  RETURN_IF_ERROR(ProcessDeferredSubmissions().status());
+  return OkStatus();
+}
+
+// Drops all deferred submissions and blocks until everything already on the
+// GPU completes.
+void SerializingCommandQueue::AbortQueueSubmission() {
+  absl::MutexLock lock(&mutex_);
+
+  // We have fences in deferred_submissions_ but they are not submitted to GPU
+  // yet so we don't need to reset.
+  deferred_submissions_.clear();
+
+  std::vector<VkFence> fences;
+  fences.reserve(pending_fences_.size());
+  for (const auto& fence : pending_fences_) fences.push_back(fence->value());
+
+  // vkWaitForFences requires fenceCount > 0; skip the wait entirely when
+  // nothing has been submitted.
+  if (!fences.empty()) {
+    syms()->vkWaitForFences(*logical_device_, fences.size(), fences.data(),
+                            /*waitAll=*/VK_TRUE, /*timeout=*/UINT64_MAX);
+  }
+
+  // Clear the list. Fences will be automatically returned back to the queue
+  // after refcount reaches 0.
+  pending_fences_.clear();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/iree/hal/vulkan/serializing_command_queue.h b/iree/hal/vulkan/serializing_command_queue.h
new file mode 100644
index 0000000..e38643b
--- /dev/null
+++ b/iree/hal/vulkan/serializing_command_queue.h
@@ -0,0 +1,111 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+#define IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
+
+#include <vulkan/vulkan.h>
+
+#include <memory>
+#include <string>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/command_buffer.h"
+#include "iree/hal/command_queue.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/timepoint_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// A command queue that potentially defers and serializes command buffer
+// submission to the GPU.
+//
+// This command queue is designed to be used together with emulated timeline
+// semaphores. Timeline semaphores can follow wait-before-signal submission
+// order but binary `VkSemaphore` cannot. So when emulating timeline semaphores
+// with binary `VkSemaphore`s and `VkFence`s, we need to make sure no
+// wait-before-signal submission order occur for binary `VkSemaphore`s. The way
+// to enforce that is to defer the submission until we can be certain that the
+// `VkSemaphore`s emulating time points in the timeline are all *submitted* to
+// the GPU.
+class SerializingCommandQueue final : public CommandQueue {
+ public:
+  SerializingCommandQueue(std::string name,
+                          CommandCategoryBitfield supported_categories,
+                          const ref_ptr<VkDeviceHandle>& logical_device,
+                          const ref_ptr<TimePointFencePool>& fence_pool,
+                          VkQueue queue);
+  ~SerializingCommandQueue() override;
+
+  // Shorthand for the device's dynamically resolved Vulkan symbols.
+  const ref_ptr<DynamicSymbols>& syms() const {
+    return logical_device_->syms();
+  }
+
+  Status Submit(absl::Span<const SubmissionBatch> batches) override;
+
+  Status WaitIdle(absl::Time deadline) override;
+
+  // Releases all deferred submissions ready to submit to the GPU.
+  Status AdvanceQueueSubmission();
+
+  // Aborts all deferred submissions and waits for submitted work to complete.
+  void AbortQueueSubmission();
+
+ private:
+  // A submission batch together with the fence to signal its status.
+  struct FencedSubmission : IntrusiveLinkBase<void> {
+    SubmissionBatch batch;
+    ref_ptr<TimePointFence> fence;
+
+    FencedSubmission(const SubmissionBatch& batch,
+                     ref_ptr<TimePointFence> fence)
+        : batch(batch), fence(std::move(fence)) {}
+  };
+
+  // Processes deferred submissions in this queue and returns whether there are
+  // new workload submitted to the GPU if no errors happen.
+  StatusOr<bool> ProcessDeferredSubmissions()
+      ABSL_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  // Pool providing one fence per GPU submission for progress tracking.
+  ref_ptr<TimePointFencePool> fence_pool_;
+
+  // Guards all submission state below as well as the VkQueue itself.
+  mutable absl::Mutex mutex_;
+
+  // A list of fences that are submitted to GPU.
+  absl::InlinedVector<ref_ptr<TimePointFence>, 4> pending_fences_
+      ABSL_GUARDED_BY(mutex_);
+  // A list of deferred submissions that haven't been submitted to GPU.
+  IntrusiveList<std::unique_ptr<FencedSubmission>> deferred_submissions_
+      ABSL_GUARDED_BY(mutex_);
+
+  // VkQueue needs to be externally synchronized.
+  VkQueue queue_ ABSL_GUARDED_BY(mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_SERIALIZING_COMMAND_QUEUE_H_
diff --git a/iree/hal/vulkan/timepoint_util.cc b/iree/hal/vulkan/timepoint_util.cc
new file mode 100644
index 0000000..c212856
--- /dev/null
+++ b/iree/hal/vulkan/timepoint_util.cc
@@ -0,0 +1,226 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/hal/vulkan/timepoint_util.h"
+
+#include <memory>
+
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "absl/utility/utility.h"
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// static
+// Custom deleter invoked when a TimePointFence's ref-count reaches zero:
+// instead of freeing it, the fence is returned to its owning pool for reuse.
+void TimePointFence::Delete(TimePointFence* ptr) {
+  ptr->pool()->ReleaseResolved(ptr);
+}
+
+VkResult TimePointFence::GetStatus() {
+  absl::MutexLock lock(&status_mutex_);
+  // Only query the driver while still unresolved; any other status is cached
+  // and returned without further API calls.
+  if (status_ == VK_NOT_READY) {
+    const auto& device = pool()->logical_device();
+    status_ = device->syms()->vkGetFenceStatus(*device, fence_);
+  }
+  return status_;
+}
+
+// static
+StatusOr<ref_ptr<TimePointFencePool>> TimePointFencePool::Create(
+    ref_ptr<VkDeviceHandle> logical_device) {
+  IREE_TRACE_SCOPE0("TimePointFencePool::Create");
+  ref_ptr<TimePointFencePool> pool(
+      new TimePointFencePool(std::move(logical_device)));
+  // Eagerly create all fences so that Acquire() never needs to allocate.
+  RETURN_IF_ERROR(pool->PreallocateFences());
+  return pool;
+}
+
+TimePointFencePool::~TimePointFencePool() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::dtor");
+
+  absl::MutexLock lock(&mutex_);
+  int free_count = 0;
+  for (auto* fence : free_fences_) {
+    syms()->vkDestroyFence(*logical_device_, fence->value(),
+                           logical_device_->allocator());
+    ++free_count;
+  }
+  // All fences must have been returned to the pool before destruction;
+  // otherwise some are still referenced (and possibly in flight on the GPU).
+  DCHECK_EQ(free_count, kMaxInFlightFenceCount);
+  free_fences_.clear();
+}
+
+StatusOr<ref_ptr<TimePointFence>> TimePointFencePool::Acquire() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::Acquire");
+
+  absl::MutexLock lock(&mutex_);
+  if (free_fences_.empty()) {
+    return ResourceExhaustedErrorBuilder(IREE_LOC)
+           << "Fence pool out of free fences";
+  }
+
+  // Free fences sit at ref-count zero; add_ref bumps the count so the fence
+  // flows back through TimePointFence::Delete when the last reference drops.
+  auto* fence = free_fences_.front();
+  free_fences_.pop_front();
+  return add_ref(fence);
+}
+
+void TimePointFencePool::ReleaseResolved(TimePointFence* fence) {
+  IREE_TRACE_SCOPE0("TimePointFencePool::ReleaseResolved");
+  // Reset the fence back to the unsignaled state before recycling it. This
+  // happens outside the lock as it only touches the fence being released.
+  VkFence f = fence->value();
+  syms()->vkResetFences(*logical_device_, 1, &f);
+  absl::MutexLock lock(&mutex_);
+  free_fences_.push_back(fence);
+}
+
+// Construction only records the device; fences are created separately in
+// PreallocateFences() so creation failures can be surfaced as a Status.
+TimePointFencePool::TimePointFencePool(ref_ptr<VkDeviceHandle> logical_device)
+    : logical_device_(std::move(logical_device)) {}
+
+// Shorthand accessor for the device's dynamically resolved Vulkan symbols.
+const ref_ptr<DynamicSymbols>& TimePointFencePool::syms() const {
+  return logical_device_->syms();
+}
+
+// Creates all kMaxInFlightFenceCount fences up front and seeds the free list
+// with them.
+Status TimePointFencePool::PreallocateFences() {
+  IREE_TRACE_SCOPE0("TimePointFencePool::PreallocateFences");
+
+  VkFenceCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+
+  std::array<std::unique_ptr<TimePointFence>, kMaxInFlightFenceCount> fences;
+  {
+    absl::MutexLock lock(&mutex_);
+    // Range-for avoids the signed/unsigned comparison of `int i < size()`.
+    for (auto& fence_slot : fences) {
+      VkFence fence = VK_NULL_HANDLE;
+      VK_RETURN_IF_ERROR(syms()->vkCreateFence(*logical_device_, &create_info,
+                                               logical_device_->allocator(),
+                                               &fence));
+      fence_slot.reset(new TimePointFence(this, fence));
+    }
+  }
+
+  for (auto& fence_slot : fences) {
+    // Each `TimePointFence` was created with an initial ref-count of one.
+    // Decrease explicitly to zero so that later we can rely on the ref-count
+    // reaching zero to auto-release the `TimePointFence` back to the free
+    // list. As a nice side effect, this will also initialize the free list
+    // with all newly created fences.
+    // TODO: Might want to avoid acquiring and releasing the mutex for each
+    // fence.
+    fence_slot.release()->ReleaseReference();
+  }
+
+  return OkStatus();
+}
+
+// static
+StatusOr<ref_ptr<TimePointSemaphorePool>> TimePointSemaphorePool::Create(
+    ref_ptr<VkDeviceHandle> logical_device) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::Create");
+  ref_ptr<TimePointSemaphorePool> pool(
+      new TimePointSemaphorePool(std::move(logical_device)));
+  // Eagerly create all semaphores so that Acquire() never needs to allocate.
+  RETURN_IF_ERROR(pool->PreallocateSemaphores());
+  return pool;
+}
+
+TimePointSemaphorePool::~TimePointSemaphorePool() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::dtor");
+
+  absl::MutexLock lock(&mutex_);
+
+  // All semaphores must have been returned before the pool goes away;
+  // otherwise some are still in use (and possibly in flight on the GPU).
+  DCHECK_EQ(free_semaphores_.size(), kMaxInFlightSemaphoreCount);
+  free_semaphores_.clear();
+
+  // Destroy the native handles directly from the fixed backing storage.
+  for (auto& semaphore : storage_) {
+    syms()->vkDestroySemaphore(*logical_device_, semaphore.semaphore,
+                               logical_device_->allocator());
+  }
+}
+
+StatusOr<TimePointSemaphore*> TimePointSemaphorePool::Acquire() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::Acquire");
+
+  absl::MutexLock lock(&mutex_);
+  if (free_semaphores_.empty()) {
+    return ResourceExhaustedErrorBuilder(IREE_LOC)
+           << "Semaphore pool out of free semaphores";
+  }
+
+  // Hand out the oldest free semaphore; ownership transfers to the caller
+  // until it is given back via ReleaseResolved()/ReleaseUnresolved().
+  auto* semaphore = free_semaphores_.front();
+  free_semaphores_.pop_front();
+  return semaphore;
+}
+
+void TimePointSemaphorePool::ReleaseResolved(
+    IntrusiveList<TimePointSemaphore>* semaphores) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseResolved");
+
+  // Resolved semaphores must have had their fences detached already; reset
+  // the value to the sentinel marking an unused semaphore.
+  for (auto* semaphore : *semaphores) {
+    DCHECK(!semaphore->signal_fence && !semaphore->wait_fence);
+    semaphore->value = UINT64_MAX;
+  }
+
+  absl::MutexLock lock(&mutex_);
+  free_semaphores_.merge_from(semaphores);
+}
+
+void TimePointSemaphorePool::ReleaseUnresolved(
+    IntrusiveList<TimePointSemaphore>* semaphores) {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::ReleaseUnresolved");
+
+  // Unlike ReleaseResolved(), semaphores here may be in any state: forcibly
+  // drop any attached fences and reset the value to the unused sentinel.
+  for (auto* semaphore : *semaphores) {
+    semaphore->signal_fence = nullptr;
+    semaphore->wait_fence = nullptr;
+    semaphore->value = UINT64_MAX;
+  }
+
+  absl::MutexLock lock(&mutex_);
+  free_semaphores_.merge_from(semaphores);
+}
+
+// Construction only records the device; semaphores are created separately in
+// PreallocateSemaphores() so creation failures can be surfaced as a Status.
+TimePointSemaphorePool::TimePointSemaphorePool(
+    ref_ptr<VkDeviceHandle> logical_device)
+    : logical_device_(std::move(logical_device)) {}
+
+// Shorthand accessor for the device's dynamically resolved Vulkan symbols.
+const ref_ptr<DynamicSymbols>& TimePointSemaphorePool::syms() const {
+  return logical_device_->syms();
+}
+
+// Creates all kMaxInFlightSemaphoreCount binary semaphores into the fixed
+// storage_ array and seeds the free list with them.
+Status TimePointSemaphorePool::PreallocateSemaphores() {
+  IREE_TRACE_SCOPE0("TimePointSemaphorePool::PreallocateSemaphores");
+
+  VkSemaphoreCreateInfo create_info;
+  create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+  create_info.pNext = nullptr;
+  create_info.flags = 0;
+
+  absl::MutexLock lock(&mutex_);
+  for (int i = 0; i < kMaxInFlightSemaphoreCount; ++i) {
+    auto* semaphore = &storage_[i];
+    VK_RETURN_IF_ERROR(syms()->vkCreateSemaphore(*logical_device_, &create_info,
+                                                 logical_device_->allocator(),
+                                                 &semaphore->semaphore));
+    free_semaphores_.push_back(semaphore);
+  }
+
+  return OkStatus();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/iree/hal/vulkan/timepoint_util.h b/iree/hal/vulkan/timepoint_util.h
new file mode 100644
index 0000000..e2cb7df
--- /dev/null
+++ b/iree/hal/vulkan/timepoint_util.h
@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+#define IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
+
+#include <vulkan/vulkan.h>
+
+#include <atomic>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/synchronization/mutex.h"
+#include "iree/base/intrusive_list.h"
+#include "iree/base/ref_ptr.h"
+#include "iree/base/status.h"
+#include "iree/hal/vulkan/handle_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+class TimePointFencePool;
+class TimePointSemaphorePool;
+
+// A fence used for tracking progress of timeline semaphores.
+//
+// Each queue submission gets a new `VkFence` associated with it so that we can
+// later query the `VkFence` on CPU to know what time points were signaled for
+// timeline semaphores.
+//
+// Ref-counting allows the fence to be associated with multiple time points from
+// different timelines without worrying about ownership complexity.
+//
+// This is expected to be used together with `TimePointFencePool` and must be
+// externally synchronized via `TimePointFencePool`'s mutex.
+class TimePointFence final : public RefObject<TimePointFence>,
+                             public IntrusiveLinkBase<void> {
+ public:
+  TimePointFence(TimePointFencePool* pool, VkFence fence)
+      : pool_(pool), fence_(fence) {}
+
+  // Non-copyable and non-movable: the pool's intrusive free list and the
+  // ref-counting scheme both rely on stable addresses.
+  TimePointFence(TimePointFence&& that) = delete;
+  TimePointFence& operator=(TimePointFence&&) = delete;
+
+  TimePointFence(const TimePointFence&) = delete;
+  TimePointFence& operator=(const TimePointFence&) = delete;
+
+  // Returns this fence to the pool on destruction.
+  static void Delete(TimePointFence* ptr);
+
+  VkFence value() const noexcept { return fence_; }
+  operator VkFence() const noexcept { return fence_; }
+
+  // Gets the status of this fence object. This might issue a Vulkan API call
+  // under the hood.
+  VkResult GetStatus();
+
+  // Returns the pool from which this fence comes.
+  TimePointFencePool* pool() const { return pool_; }
+
+ private:
+  // The pool from which this fence comes.
+  TimePointFencePool* pool_;
+
+  // Allocated fence that associated with a bunch of time point(s) of
+  // timeline(s). This is passed to queue submission so that we can track the
+  // timeline(s) progress on CPU and schedule work.
+  VkFence fence_;
+
+  // The fence's status, cached once it leaves VK_NOT_READY (see GetStatus()).
+  absl::Mutex status_mutex_;
+  VkResult status_ ABSL_GUARDED_BY(status_mutex_) = VK_NOT_READY;
+};
+
+// A semaphore used for emulating a specific time point of timeline semaphores.
+//
+// Each signaled time point in a timeline semaphore is emulated with a new
+// binary `VkSemaphore` associated with queue submission. These time point
+// semaphores are stored in `EmulatedTimelineSemaphore` to quickly scan and
+// process signaled values.
+//
+// This is expected to be used together with `TimePointSemaphorePool` and
+// `EmulatedTimelineSemaphore` and must be externally synchronized via their
+// mutexes.
+struct TimePointSemaphore final : public IntrusiveLinkBase<void> {
+  // Allocated binary semaphore that represents a time point in the timeline.
+  // This is passed to queue submission.
+  VkSemaphore semaphore = VK_NULL_HANDLE;
+
+  // Value of the timeline should be at when the binary semaphore is signaled.
+  // UINT64_MAX marks the semaphore as unused (not emulating any time point).
+  uint64_t value = UINT64_MAX;
+
+  // The fence associated with the queue submission signaling this semaphore.
+  // nullptr means this binary semaphore has not been submitted to GPU.
+  ref_ptr<TimePointFence> signal_fence = nullptr;
+
+  // The fence associated with the queue submission waiting this semaphore.
+  // nullptr means this binary semaphore has not been waited by any queue
+  // submission.
+  ref_ptr<TimePointFence> wait_fence = nullptr;
+};
+
+// A pool of `VkFence`s that can be used by `EmulatedTimelineSemaphore` to track
+// timeline progress on CPU. Each `VkFence` can be used to query the status of
+// all the semaphores in the same submission to a `VkQueue`.
+class TimePointFencePool final : public RefObject<TimePointFencePool> {
+ public:
+  static constexpr int kMaxInFlightFenceCount = 32;
+
+  // Creates a new pool and pre-allocates `kMaxInFlightFenceCount` fences.
+  static StatusOr<ref_ptr<TimePointFencePool>> Create(
+      ref_ptr<VkDeviceHandle> logical_device);
+
+  ~TimePointFencePool();
+
+  // Acquires a fence from the pool for use by the caller. The fence is
+  // guaranteed to be in unsignaled state and not in-flight on GPU.
+  //
+  // Returns RESOURCE_EXHAUSTED if the pool has no more available fences.
+  // Callers are expected to handle this by waiting on previous fences or for
+  // complete device idle. Yes, that's as bad as it sounds, and if we start
+  // seeing that we should bump up the max count.
+  StatusOr<ref_ptr<TimePointFence>> Acquire();
+
+  // Releases one fence back to the pool. The fence must either be signaled or
+  // not be in flight on GPU.
+  void ReleaseResolved(TimePointFence* fence);
+
+  const ref_ptr<VkDeviceHandle>& logical_device() const {
+    return logical_device_;
+  }
+
+ private:
+  explicit TimePointFencePool(ref_ptr<VkDeviceHandle> logical_device);
+
+  // Shorthand for the device's dynamically resolved Vulkan symbols.
+  const ref_ptr<DynamicSymbols>& syms() const;
+
+  // Creates all fences up front; called once from Create().
+  Status PreallocateFences() ABSL_LOCKS_EXCLUDED(mutex_);
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  // Guards the free list below.
+  absl::Mutex mutex_;
+
+  IntrusiveList<TimePointFence> free_fences_ ABSL_GUARDED_BY(mutex_);
+};
+
+// A pool of `VkSemaphore`s that can be used by `EmulatedTimelineSemaphore` to
+// simulate individual timeline value signaling.
+class TimePointSemaphorePool final : public RefObject<TimePointSemaphorePool> {
+ public:
+  static constexpr int kMaxInFlightSemaphoreCount = 64;
+
+  // Creates a new pool and pre-allocates `kMaxInFlightSemaphoreCount` binary
+  // semaphores.
+  static StatusOr<ref_ptr<TimePointSemaphorePool>> Create(
+      ref_ptr<VkDeviceHandle> logical_device);
+
+  ~TimePointSemaphorePool();
+
+  // Acquires a binary semaphore from the pool for use by the caller. The
+  // semaphore is guaranteed to be in unsignaled state and not in-flight on GPU.
+  //
+  // Returns RESOURCE_EXHAUSTED if the pool has no more available semaphores.
+  // Callers are expected to handle this by waiting on previous fences or for
+  // complete device idle. Yes, that's as bad as it sounds, and if we start
+  // seeing that we should bump up the max count.
+  StatusOr<TimePointSemaphore*> Acquire();
+
+  // Releases one or more semaphores back to the pool. The binary semaphore must
+  // be unsignaled and not in flight on GPU.
+  void ReleaseResolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+  // Releases one or more semaphores back to the pool. These may be in any state
+  // and will be assumed as untouchable; the pool will unconditionally recycle
+  // them.
+  void ReleaseUnresolved(IntrusiveList<TimePointSemaphore>* semaphores);
+
+ private:
+  explicit TimePointSemaphorePool(ref_ptr<VkDeviceHandle> logical_device);
+
+  // Shorthand for the device's dynamically resolved Vulkan symbols.
+  const ref_ptr<DynamicSymbols>& syms() const;
+
+  // Creates all semaphores up front; called once from Create().
+  Status PreallocateSemaphores() ABSL_LOCKS_EXCLUDED(mutex_);
+
+  ref_ptr<VkDeviceHandle> logical_device_;
+
+  // Guards the free list; storage_ gives semaphores stable addresses so the
+  // free list can be intrusive.
+  absl::Mutex mutex_;
+
+  std::array<TimePointSemaphore, kMaxInFlightSemaphoreCount> storage_
+      ABSL_GUARDED_BY(mutex_);
+  IntrusiveList<TimePointSemaphore> free_semaphores_ ABSL_GUARDED_BY(mutex_);
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_TIMEPOINT_UTIL_H_
diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc
index 3c7e37b..42b56c4 100644
--- a/iree/hal/vulkan/vulkan_device.cc
+++ b/iree/hal/vulkan/vulkan_device.cc
@@ -30,12 +30,14 @@
 #include "iree/hal/vulkan/direct_command_buffer.h"
 #include "iree/hal/vulkan/direct_command_queue.h"
 #include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
 #include "iree/hal/vulkan/extensibility_util.h"
 #include "iree/hal/vulkan/native_descriptor_set.h"
 #include "iree/hal/vulkan/native_event.h"
 #include "iree/hal/vulkan/native_timeline_semaphore.h"
 #include "iree/hal/vulkan/pipeline_cache.h"
 #include "iree/hal/vulkan/pipeline_executable_layout.h"
+#include "iree/hal/vulkan/serializing_command_queue.h"
 #include "iree/hal/vulkan/status_util.h"
 #include "iree/hal/vulkan/vma_allocator.h"
 
@@ -164,6 +166,7 @@
     const DeviceInfo& device_info,
     const ref_ptr<VkDeviceHandle>& logical_device,
     const QueueSet& compute_queue_set, const QueueSet& transfer_queue_set,
+    const ref_ptr<TimePointFencePool>& fence_pool,
     const ref_ptr<DynamicSymbols>& syms) {
   absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues;
 
@@ -175,10 +178,17 @@
     syms->vkGetDeviceQueue(*logical_device,
                            compute_queue_set.queue_family_index, i, &queue);
     std::string queue_name = absl::StrCat(device_info.name(), ":d", i);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+    command_queues.push_back(absl::make_unique<SerializingCommandQueue>(
+        std::move(queue_name),
+        CommandCategory::kDispatch | CommandCategory::kTransfer, logical_device,
+        fence_pool, queue));
+#else
     command_queues.push_back(absl::make_unique<DirectCommandQueue>(
         std::move(queue_name),
         CommandCategory::kDispatch | CommandCategory::kTransfer, logical_device,
         queue));
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
   }
 
   uint64_t transfer_queue_count = CountOnes64(transfer_queue_set.queue_indices);
@@ -189,9 +199,15 @@
     syms->vkGetDeviceQueue(*logical_device,
                            transfer_queue_set.queue_family_index, i, &queue);
     std::string queue_name = absl::StrCat(device_info.name(), ":t", i);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+    command_queues.push_back(absl::make_unique<SerializingCommandQueue>(
+        std::move(queue_name), CommandCategory::kTransfer, logical_device,
+        fence_pool, queue));
+#else
     command_queues.push_back(absl::make_unique<DirectCommandQueue>(
         std::move(queue_name), CommandCategory::kTransfer, logical_device,
         queue));
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
   }
 
   return command_queues;
@@ -354,14 +370,27 @@
   for (uint32_t i = 0; i < queue_family_info.transfer_queue_count; ++i) {
     transfer_queue_set.queue_indices |= 1 << (i + base_queue_index);
   }
-  auto command_queues = CreateCommandQueues(
-      device_info, logical_device, compute_queue_set, transfer_queue_set, syms);
+
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+  ASSIGN_OR_RETURN(auto semaphore_pool,
+                   TimePointSemaphorePool::Create(add_ref(logical_device)));
+  ASSIGN_OR_RETURN(auto fence_pool,
+                   TimePointFencePool::Create(add_ref(logical_device)));
+#else
+  ref_ptr<TimePointSemaphorePool> semaphore_pool = nullptr;
+  ref_ptr<TimePointFencePool> fence_pool = nullptr;
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+  auto command_queues =
+      CreateCommandQueues(device_info, logical_device, compute_queue_set,
+                          transfer_queue_set, fence_pool, syms);
 
   return assign_ref(new VulkanDevice(
       std::move(driver), device_info, physical_device,
       std::move(logical_device), std::move(allocator),
       std::move(command_queues), std::move(dispatch_command_pool),
-      std::move(transfer_command_pool), debug_capture_manager));
+      std::move(transfer_command_pool), std::move(semaphore_pool),
+      std::move(fence_pool), debug_capture_manager));
 }
 
 // static
@@ -421,13 +450,25 @@
                          device_handle, transfer_queue_set.queue_family_index));
   }
 
-  auto command_queues = CreateCommandQueues(
-      device_info, device_handle, compute_queue_set, transfer_queue_set, syms);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+  ASSIGN_OR_RETURN(auto semaphore_pool,
+                   TimePointSemaphorePool::Create(add_ref(device_handle)));
+  ASSIGN_OR_RETURN(auto fence_pool,
+                   TimePointFencePool::Create(add_ref(device_handle)));
+#else
+  ref_ptr<TimePointSemaphorePool> semaphore_pool = nullptr;
+  ref_ptr<TimePointFencePool> fence_pool = nullptr;
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+  auto command_queues =
+      CreateCommandQueues(device_info, device_handle, compute_queue_set,
+                          transfer_queue_set, fence_pool, syms);
 
   return assign_ref(new VulkanDevice(
       std::move(driver), device_info, physical_device, std::move(device_handle),
       std::move(allocator), std::move(command_queues),
       std::move(dispatch_command_pool), std::move(transfer_command_pool),
+      std::move(semaphore_pool), std::move(fence_pool),
       /*debug_capture_manager=*/nullptr));
 }
 
@@ -438,6 +479,8 @@
     absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues,
     ref_ptr<VkCommandPoolHandle> dispatch_command_pool,
     ref_ptr<VkCommandPoolHandle> transfer_command_pool,
+    ref_ptr<TimePointSemaphorePool> semaphore_pool,
+    ref_ptr<TimePointFencePool> fence_pool,
     DebugCaptureManager* debug_capture_manager)
     : Device(device_info),
       driver_(std::move(driver)),
@@ -449,6 +492,8 @@
           make_ref<DescriptorPoolCache>(add_ref(logical_device_))),
       dispatch_command_pool_(std::move(dispatch_command_pool)),
       transfer_command_pool_(std::move(transfer_command_pool)),
+      semaphore_pool_(std::move(semaphore_pool)),
+      fence_pool_(std::move(fence_pool)),
       debug_capture_manager_(debug_capture_manager) {
   // Populate the queue lists based on queue capabilities.
   for (auto& command_queue : command_queues_) {
@@ -650,8 +695,33 @@
 StatusOr<ref_ptr<Semaphore>> VulkanDevice::CreateSemaphore(
     uint64_t initial_value) {
   IREE_TRACE_SCOPE0("VulkanDevice::CreateSemaphore");
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+  return EmulatedTimelineSemaphore::Create(
+      add_ref(logical_device_),
+      // Triggers necessary processing on all queues due to new values being
+      // signaled for the given timeline |semaphore|.
+      [this](Semaphore* /*semaphore*/) -> Status {
+        IREE_TRACE_SCOPE0("<lambda>::OnSemaphoreSignal");
+        for (const auto& queue : command_queues_) {
+          RETURN_IF_ERROR(static_cast<SerializingCommandQueue*>(queue.get())
+                              ->AdvanceQueueSubmission());
+        }
+        return OkStatus();
+      },
+      // Triggers necessary processing on all queues due to failures for the
+      // given timeline |semaphore|.
+      [this](Semaphore* /*semaphore*/) {
+        IREE_TRACE_SCOPE0("<lambda>::OnSemaphoreFailure");
+        for (const auto& queue : command_queues_) {
+          static_cast<SerializingCommandQueue*>(queue.get())
+              ->AbortQueueSubmission();
+        }
+      },
+      add_ref(semaphore_pool_), initial_value);
+#else
   return NativeTimelineSemaphore::Create(add_ref(logical_device_),
                                          initial_value);
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
 }
 
 Status VulkanDevice::WaitAllSemaphores(
@@ -672,6 +742,23 @@
                                     VkSemaphoreWaitFlags wait_flags) {
   IREE_TRACE_SCOPE0("VulkanDevice::WaitSemaphores");
 
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
+
+  // TODO(antiagainst): We should actually get the fences associated with the
+  // emulated timeline semaphores so that we can wait on them in a batch. This
+  // implementation is problematic if we want to wait on any semaphore and the
+  // first semaphore takes extra long while the following ones signal quickly.
+  for (int i = 0; i < semaphores.size(); ++i) {
+    auto* semaphore =
+        static_cast<EmulatedTimelineSemaphore*>(semaphores[i].semaphore);
+    RETURN_IF_ERROR(semaphore->Wait(semaphores[i].value, deadline));
+    if (wait_flags & VK_SEMAPHORE_WAIT_ANY_BIT) return OkStatus();
+  }
+
+  return OkStatus();
+
+#else
+
   absl::InlinedVector<VkSemaphore, 4> semaphore_handles(semaphores.size());
   absl::InlinedVector<uint64_t, 4> semaphore_values(semaphores.size());
   for (int i = 0; i < semaphores.size(); ++i) {
@@ -714,6 +801,8 @@
   // semaphores we waited on (including those already expired above).
 
   return OkStatus();
+
+#endif  // IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES
 }
 
 Status VulkanDevice::WaitIdle(absl::Time deadline) {
diff --git a/iree/hal/vulkan/vulkan_device.h b/iree/hal/vulkan/vulkan_device.h
index cfceb4f..ce7c9d7 100644
--- a/iree/hal/vulkan/vulkan_device.h
+++ b/iree/hal/vulkan/vulkan_device.h
@@ -30,6 +30,7 @@
 #include "iree/hal/semaphore.h"
 #include "iree/hal/vulkan/descriptor_pool_cache.h"
 #include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/emulated_timeline_semaphore.h"
 #include "iree/hal/vulkan/extensibility_util.h"
 #include "iree/hal/vulkan/handle_util.h"
 
@@ -119,6 +120,8 @@
       absl::InlinedVector<std::unique_ptr<CommandQueue>, 4> command_queues,
       ref_ptr<VkCommandPoolHandle> dispatch_command_pool,
       ref_ptr<VkCommandPoolHandle> transfer_command_pool,
+      ref_ptr<TimePointSemaphorePool> semaphore_pool,
+      ref_ptr<TimePointFencePool> fence_pool,
       DebugCaptureManager* debug_capture_manager);
 
   Status WaitSemaphores(absl::Span<const SemaphoreValue> semaphores,
@@ -139,6 +142,10 @@
   ref_ptr<VkCommandPoolHandle> dispatch_command_pool_;
   ref_ptr<VkCommandPoolHandle> transfer_command_pool_;
 
+  // Fields used for emulated timeline semaphores.
+  ref_ptr<TimePointSemaphorePool> semaphore_pool_;
+  ref_ptr<TimePointFencePool> fence_pool_;
+
   DebugCaptureManager* debug_capture_manager_ = nullptr;
 };
 
diff --git a/iree/hal/vulkan/vulkan_driver_module.cc b/iree/hal/vulkan/vulkan_driver_module.cc
index 4b98f58..f034127 100644
--- a/iree/hal/vulkan/vulkan_driver_module.cc
+++ b/iree/hal/vulkan/vulkan_driver_module.cc
@@ -67,9 +67,11 @@
   // promoted to core, so we list it as optional even though we require it.
   options.instance_extensibility.optional_extensions.push_back(
       VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+#if IREE_HAL_VULKAN_EMULATE_TIMELINE_SEMAPHORES == 0
   // Timeline semaphore support is required.
   options.device_extensibility.required_extensions.push_back(
       VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
+#endif
 
   if (absl::GetFlag(FLAGS_vulkan_validation_layers)) {
     options.instance_extensibility.optional_layers.push_back(