Merging early work on Tracy feature branch.

This replaces WTF with Tracy, which can be enabled in CMake builds with the `IREE_ENABLE_RUNTIME_TRACING` option. Bazel support will follow in a later change. This initial commit provides a full C API for Tracy, C++ shims for the existing `IREE_TRACE_SCOPE0` macro (the one most of the codebase uses today), and allocation tracking.

This has the side effect of removing another ALWAYSLINK library. Yay!

Progress on #1886: the C API is in, but the C++ API still needs to be finished before this can be called done. Vulkan tracing will be added separately with #1937, as that requires more work.

Fixes #1480.

![image](https://user-images.githubusercontent.com/75337/82014451-7e9cf180-9631-11ea-8789-6533e86702d6.png)

Closes https://github.com/google/iree/pull/1936

PiperOrigin-RevId: 311789884
diff --git a/.gitmodules b/.gitmodules
index 16382de..8733410 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -19,10 +19,6 @@
 [submodule "third_party/vulkan_memory_allocator"]
 	path = third_party/vulkan_memory_allocator
 	url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.git
-[submodule "third_party/google_tracing_framework"]
-	path = third_party/google_tracing_framework
-	url = https://github.com/google/tracing-framework.git
-	fetchRecurseSubmodules = false
 [submodule "third_party/spirv_headers"]
 	path = third_party/spirv_headers
 	url = https://github.com/KhronosGroup/SPIRV-Headers.git
@@ -48,3 +44,6 @@
 [submodule "third_party/ruy"]
 	path = third_party/ruy
 	url = https://github.com/google/ruy
+[submodule "third_party/tracy"]
+	path = third_party/tracy
+	url = https://github.com/wolfpld/tracy.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ddca5cb..7d2d3db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,9 +31,8 @@
 set(IREE_IDE_FOLDER IREE)
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
-option(IREE_ENABLE_DEBUG "Enables debugging of the VM." ON)
+option(IREE_ENABLE_RUNTIME_TRACING "Enables instrumented runtime tracing." OFF)
 option(IREE_ENABLE_LLVM "Enables LLVM dependencies." ON)
-option(IREE_ENABLE_TRACING "Enables WTF tracing." OFF)
 
 option(IREE_BUILD_COMPILER "Builds the IREE compiler." ON)
 option(IREE_BUILD_TESTS "Builds IREE unit tests." ON)
diff --git a/SUBMODULE_VERSIONS b/SUBMODULE_VERSIONS
index 98e2374..97f4088 100644
--- a/SUBMODULE_VERSIONS
+++ b/SUBMODULE_VERSIONS
@@ -2,7 +2,6 @@
 daff5fead3fbe22c6fc58310ca3f49caf117f185 third_party/benchmark
 4c13807b7d43ff0946b7ffea0ae3aee9e611d778 third_party/dear_imgui
 97f3aa91746a7d207513a73725e92cee7c35bb87 third_party/flatbuffers
-89ca6c25bae7c11d12409739b2ef707ed9afe6c2 third_party/google_tracing_framework
 f2fb48c3b3d79a75a88a99fba6576b25d42ec528 third_party/googletest
 9d4b4f344d8ea917e082cf58d66b71c0171e1650 third_party/llvm-project
 80d452484c5409444b0ec19383faa84bb7a4d351 third_party/pybind11
@@ -11,6 +10,7 @@
 f8bf11a0253a32375c32cad92c841237b96696c0 third_party/spirv_headers
 5cf1e9a31c90ccd9ec40d0c5ef1357f6e0ec1cfd third_party/swiftshader
 9957cb60a248ba1e61d5606a3d0a189290f36b37 third_party/tensorflow
+864d86e8b6d21449474db5e9313dbff90aa9c24f third_party/tracy
 8a457f8552d8d47ce3a96ed80a714ff6396f8ad8 third_party/vulkan_extensionlayer
 9bd3f561bcee3f01d22912de10bb07ce4e23d378 third_party/vulkan_headers
 909f36b714c9239ee0b112a321220213a474ba53 third_party/vulkan_memory_allocator
diff --git a/WORKSPACE b/WORKSPACE
index 27174b0..5ed76db 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -168,12 +168,6 @@
     path = "third_party/flatbuffers",
 )
 
-maybe(new_local_repository,
-    name = "com_google_tracing_framework_cpp",
-    path = "third_party/google_tracing_framework/bindings/cpp",
-    build_file = "build_tools/third_party/google_tracing_framework_cpp/BUILD.overlay",
-)
-
 # TODO(scotttodd): TensorFlow is squatting on the vulkan_headers repo name, so
 # we use a temporary one until resolved. Theirs is set to an outdated version.
 maybe(new_local_repository,
diff --git a/bindings/python/pyiree/rt/BUILD b/bindings/python/pyiree/rt/BUILD
index cca264c..5415077 100644
--- a/bindings/python/pyiree/rt/BUILD
+++ b/bindings/python/pyiree/rt/BUILD
@@ -62,8 +62,6 @@
         ":rt_library",
         "//bindings/python/pyiree/common",
         "//iree/base:initializer",
-        "//iree/base:tracing",
-        "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp",
     ],
 )
 
diff --git a/bindings/python/pyiree/rt/CMakeLists.txt b/bindings/python/pyiree/rt/CMakeLists.txt
index b90c93e..0bc070f 100644
--- a/bindings/python/pyiree/rt/CMakeLists.txt
+++ b/bindings/python/pyiree/rt/CMakeLists.txt
@@ -34,9 +34,6 @@
     ::rt_library
     bindings::python::pyiree::common
     iree::base::initializer
-    iree::base::tracing
-    # TODO(marbre): Add dependency
-    # "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp",
   COPTS
     ${PYBIND_COPTS}
     ${PYBIND_EXTENSION_COPTS}
diff --git a/bindings/python/pyiree/rt/initialize_module.cc b/bindings/python/pyiree/rt/initialize_module.cc
index a5dea72..e4e3824 100644
--- a/bindings/python/pyiree/rt/initialize_module.cc
+++ b/bindings/python/pyiree/rt/initialize_module.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <mutex>  // NOLINT
-
 #include "bindings/python/pyiree/common/binding.h"
 #include "bindings/python/pyiree/common/status_utils.h"
 #include "bindings/python/pyiree/rt/function_abi.h"
@@ -21,86 +19,10 @@
 #include "bindings/python/pyiree/rt/host_types.h"
 #include "bindings/python/pyiree/rt/vm.h"
 #include "iree/base/initializer.h"
-#include "iree/base/tracing.h"
-#include "wtf/event.h"
-#include "wtf/macros.h"
 
 namespace iree {
 namespace python {
 
-namespace {
-
-// Wrapper around wtf::ScopedEvent to make it usable as a python context
-// object.
-class PyScopedEvent {
- public:
-  PyScopedEvent(std::string name_spec)
-      : scoped_event_(InternEvent(std::move(name_spec))) {}
-
-  bool Enter() {
-    if (scoped_event_) {
-      scoped_event_->Enter();
-      return true;
-    }
-    return false;
-  }
-
-  void Exit(py::args args) {
-    if (scoped_event_) scoped_event_->Leave();
-  }
-
- private:
-  static ::wtf::ScopedEvent<>* InternEvent(std::string name_spec) {
-    if (!::wtf::kMasterEnable) return nullptr;
-    std::lock_guard<std::mutex> lock(mu_);
-    auto it = scoped_event_intern_.find(name_spec);
-    if (it == scoped_event_intern_.end()) {
-      // Name spec must live forever.
-      std::string* dup_name_spec = new std::string(std::move(name_spec));
-      // So must the event.
-      auto scoped_event = new ::wtf::ScopedEvent<>(dup_name_spec->c_str());
-      scoped_event_intern_.insert(std::make_pair(*dup_name_spec, scoped_event));
-      return scoped_event;
-    } else {
-      return it->second;
-    }
-  }
-
-  static std::mutex mu_;
-  static std::unordered_map<std::string, ::wtf::ScopedEvent<>*>
-      scoped_event_intern_;
-  ::wtf::ScopedEvent<>* scoped_event_;
-};
-
-std::mutex PyScopedEvent::mu_;
-std::unordered_map<std::string, ::wtf::ScopedEvent<>*>
-    PyScopedEvent::scoped_event_intern_;
-
-void SetupTracingBindings(pybind11::module m) {
-  m.def("enable_thread", []() { WTF_AUTO_THREAD_ENABLE(); });
-  m.def("is_available", []() { return IsTracingAvailable(); });
-  m.def(
-      "flush",
-      [](absl::optional<std::string> explicit_trace_path) {
-        absl::optional<absl::string_view> sv_path;
-        if (explicit_trace_path) sv_path = explicit_trace_path;
-        FlushTrace(explicit_trace_path);
-      },
-      py::arg("explicit_trace_path") = absl::optional<absl::string_view>());
-  m.def(
-      "autoflush",
-      [](float period) { StartTracingAutoFlush(absl::Seconds(period)); },
-      py::arg("period") = 5.0f);
-  m.def("stop", []() { StopTracing(); });
-
-  py::class_<PyScopedEvent>(m, "ScopedEvent")
-      .def(py::init<std::string>())
-      .def("__enter__", &PyScopedEvent::Enter)
-      .def("__exit__", &PyScopedEvent::Exit);
-}
-
-}  // namespace
-
 PYBIND11_MODULE(binding, m) {
   IREE_RUN_MODULE_INITIALIZERS();
 
@@ -109,9 +31,6 @@
   SetupHostTypesBindings(m);
   SetupHalBindings(m);
   SetupVmBindings(m);
-
-  auto tracing_m = m.def_submodule("tracing", "IREE tracing api");
-  SetupTracingBindings(tracing_m);
 }
 
 }  // namespace python
diff --git a/build_tools/cmake/iree_copts.cmake b/build_tools/cmake/iree_copts.cmake
index b9689bd..f4f1be6 100644
--- a/build_tools/cmake/iree_copts.cmake
+++ b/build_tools/cmake/iree_copts.cmake
@@ -55,12 +55,6 @@
 set(IREE_DEFAULT_LINKOPTS "${ABSL_DEFAULT_LINKOPTS}")
 set(IREE_TEST_COPTS "${ABSL_TEST_COPTS}")
 
-if(${IREE_ENABLE_TRACING})
-  list(APPEND IREE_DEFAULT_COPTS
-    "-DGLOBAL_WTF_ENABLE=1"
-  )
-endif()
-
 #-------------------------------------------------------------------------------
 # Compiler: Clang/LLVM
 #-------------------------------------------------------------------------------
@@ -153,11 +147,3 @@
   ${PROJECT_SOURCE_DIR}/third_party/tensorflow
   ${PROJECT_BINARY_DIR}/build_tools/third_party/tensorflow
 )
-
-#-------------------------------------------------------------------------------
-# Third party: tracing
-#-------------------------------------------------------------------------------
-
-list(APPEND IREE_COMMON_INCLUDE_DIRS
-  ${PROJECT_SOURCE_DIR}/third_party/google_tracing_framework/bindings/cpp/include
-)
diff --git a/docs/repository_management.md b/docs/repository_management.md
index ba53dd1..3af849b 100644
--- a/docs/repository_management.md
+++ b/docs/repository_management.md
@@ -76,7 +76,6 @@
 4c13807b7d43ff0946b7ffea0ae3aee9e611d778 third_party/dear_imgui
 97f3aa91746a7d207513a73725e92cee7c35bb87 third_party/flatbuffers
 3d62e9545bd15c5df9ccfdd8453b93d64a6dd8eb third_party/ruy
-495ced98de99a5895e484b2e09771edb42d3c7ab third_party/google_tracing_framework
 f2fb48c3b3d79a75a88a99fba6576b25d42ec528 third_party/googletest
 a21beccea2020f950845cbb68db663d0737e174c third_party/llvm-project
 80d452484c5409444b0ec19383faa84bb7a4d351 third_party/pybind11
diff --git a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
index eb0c72e..c32ba4c 100644
--- a/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
+++ b/integrations/tensorflow/bindings/python/pyiree/tf/support/tf_test_utils.py
@@ -550,8 +550,6 @@
   # instances mirroring _modules_to_compile.
   compiled_modules = None
 
-  TRACE_FILE_NAME = None
-
   def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.modules = None
@@ -626,13 +624,6 @@
 
   @classmethod
   def tearDownClass(cls):
-    trace_file_name = cls.TRACE_FILE_NAME
-    if not trace_file_name:
-      trace_file_name = cls.__name__ + ".wtf-trace"
-    trace_file = os.path.join(tempfile.gettempdir(), trace_file_name)
-    print("Flushing trace file to:", trace_file)
-    rt.binding.tracing.flush(trace_file)
-    print("Flush complete")
     super().tearDownClass()
 
   def setUp(self):
diff --git a/iree/base/BUILD b/iree/base/BUILD
index 383ad54..e6f7a59 100644
--- a/iree/base/BUILD
+++ b/iree/base/BUILD
@@ -502,55 +502,8 @@
     name = "tracing",
     hdrs = ["tracing.h"],
     deps = [
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp",
-    ] + select({
-        "@com_google_tracing_framework_cpp//:wtf_enable": [":tracing_enabled"],
-        "//conditions:default": [":tracing_disabled"],
-    }),
-)
-
-cc_library(
-    name = "tracing_disabled",
-    srcs = [
-        "tracing.h",
-        "tracing_disabled.cc",
-    ],
-    visibility = ["//visibility:private"],
-    deps = [
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp",
-    ],
-    alwayslink = 1,
-)
-
-cc_library(
-    name = "tracing_enabled",
-    srcs = [
-        "tracing.cc",
-        "tracing.h",
-    ],
-    visibility = ["//visibility:private"],
-    deps = [
-        ":file_io",
-        ":file_path",
-        ":initializer",
-        ":logging",
-        ":status",
         "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/flags:flag",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_absl//absl/types:optional",
-        "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp",
     ],
-    alwayslink = 1,
 )
 
 # Dependent code has been removed and wait_handle is currently incompatible
diff --git a/iree/base/CMakeLists.txt b/iree/base/CMakeLists.txt
index 95b5c63..16b7831 100644
--- a/iree/base/CMakeLists.txt
+++ b/iree/base/CMakeLists.txt
@@ -589,20 +589,20 @@
   PUBLIC
 )
 
-if(${IREE_ENABLE_TRACING})
+if(${IREE_ENABLE_RUNTIME_TRACING})
   iree_cc_library(
     NAME
       tracing
     HDRS
       "tracing.h"
+      "${IREE_ROOT_DIR}/third_party/tracy/Tracy.hpp"
+      "${IREE_ROOT_DIR}/third_party/tracy/TracyC.h"
+    SRCS
+      "tracing.cc"
     DEPS
-      absl::strings
-      absl::optional
-      absl::time
-      iree::base::tracing_enabled
-      # TODO(marbre): Add dependencies
-      # "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp"
-      # "@com_google_tracing_framework_cpp//:wtf_enable": [":tracing_enabled"]
+      absl::core_headers
+    DEFINES
+      "IREE_TRACING_MODE=2"
     PUBLIC
   )
 else()
@@ -612,54 +612,8 @@
     HDRS
       "tracing.h"
     DEPS
-      absl::strings
-      absl::time
-      absl::optional
-      iree::base::tracing_disabled
-    PUBLIC
-  )
-endif()
-
-iree_cc_library(
-  NAME
-    tracing_disabled
-  HDRS
-    "tracing.h"
-  SRCS
-    "tracing_disabled.cc"
-  DEPS
-    absl::flags
-    absl::strings
-    absl::time
-    absl::optional
-    # TODO(marbre): Add dependency
-    # "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp"
-  ALWAYSLINK
-)
-
-if(${IREE_ENABLE_TRACING})
-  iree_cc_library(
-    NAME
-      tracing_enabled
-    HDRS
-      "tracing.h"
-    SRCS
-      "tracing_enabled.cc"
-    DEPS
       absl::core_headers
-      absl::flags
-      absl::strings
-      absl::synchronization
-      absl::time
-      absl::optional
-      iree::base::file_io
-      iree::base::file_path
-      iree::base::initializer
-      iree::base::logging
-      iree::base::status
-      # TODO(marbre): Add dependency
-      # "@com_google_tracing_framework_cpp//:tracing_framework_bindings_cpp"
-    ALWAYSLINK
+    PUBLIC
   )
 endif()
 
diff --git a/iree/base/api.cc b/iree/base/api.cc
index 6a77ef7..4047dec 100644
--- a/iree/base/api.cc
+++ b/iree/base/api.cc
@@ -133,6 +133,8 @@
     return IREE_STATUS_RESOURCE_EXHAUSTED;
   }
 
+  IREE_TRACE_ALLOC(ptr, byte_length);
+
   *out_ptr = ptr;
   return IREE_STATUS_OK;
 }
@@ -140,6 +142,7 @@
 IREE_API_EXPORT iree_status_t IREE_API_CALL
 iree_allocator_system_free(void* self, void* ptr) {
   IREE_TRACE_SCOPE0("iree_allocator_system_free");
+  IREE_TRACE_FREE(ptr);
   if (ptr) {
     std::free(ptr);
   }
diff --git a/iree/base/internal/BUILD b/iree/base/internal/BUILD
index 2367ec8..f246471 100644
--- a/iree/base/internal/BUILD
+++ b/iree/base/internal/BUILD
@@ -46,6 +46,7 @@
         "//iree/base:platform_headers",
         "//iree/base:status",
         "//iree/base:target_platform",
+        "//iree/base:tracing",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
diff --git a/iree/base/internal/CMakeLists.txt b/iree/base/internal/CMakeLists.txt
index 2a2c2dd..b44dfd0 100644
--- a/iree/base/internal/CMakeLists.txt
+++ b/iree/base/internal/CMakeLists.txt
@@ -44,6 +44,7 @@
     iree::base::platform_headers
     iree::base::status
     iree::base::target_platform
+    iree::base::tracing
   PUBLIC
 )
 
diff --git a/iree/base/internal/file_io_posix.cc b/iree/base/internal/file_io_posix.cc
index ee9cd4b..ac213f1 100644
--- a/iree/base/internal/file_io_posix.cc
+++ b/iree/base/internal/file_io_posix.cc
@@ -18,6 +18,7 @@
 #include "iree/base/file_io.h"
 #include "iree/base/status.h"
 #include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
 
 #if defined(IREE_PLATFORM_ANDROID) || defined(IREE_PLATFORM_APPLE) || \
     defined(IREE_PLATFORM_LINUX)
@@ -30,6 +31,7 @@
 namespace file_io {
 
 Status FileExists(const std::string& path) {
+  IREE_TRACE_SCOPE0("file_io::FileExists");
   struct stat stat_buf;
   return stat(path.c_str(), &stat_buf) == 0
              ? OkStatus()
@@ -37,6 +39,7 @@
 }
 
 StatusOr<std::string> GetFileContents(const std::string& path) {
+  IREE_TRACE_SCOPE0("file_io::GetFileContents");
   std::unique_ptr<FILE, void (*)(FILE*)> file = {std::fopen(path.c_str(), "r"),
                                                  +[](FILE* file) {
                                                    if (file) fclose(file);
@@ -70,6 +73,7 @@
 }
 
 Status SetFileContents(const std::string& path, const std::string& content) {
+  IREE_TRACE_SCOPE0("file_io::SetFileContents");
   std::unique_ptr<FILE, void (*)(FILE*)> file = {std::fopen(path.c_str(), "wb"),
                                                  +[](FILE* file) {
                                                    if (file) fclose(file);
@@ -87,6 +91,7 @@
 }
 
 Status DeleteFile(const std::string& path) {
+  IREE_TRACE_SCOPE0("file_io::DeleteFile");
   if (::remove(path.c_str()) == -1) {
     return ErrnoToCanonicalStatusBuilder(
         errno, absl::StrCat("Failed to delete file '", path, "'"), IREE_LOC);
@@ -96,6 +101,7 @@
 
 Status MoveFile(const std::string& source_path,
                 const std::string& destination_path) {
+  IREE_TRACE_SCOPE0("file_io::MoveFile");
   if (::rename(source_path.c_str(), destination_path.c_str()) == -1) {
     return ErrnoToCanonicalStatusBuilder(
         errno,
diff --git a/iree/base/internal/file_io_win32.cc b/iree/base/internal/file_io_win32.cc
index 68e044b..a36f568 100644
--- a/iree/base/internal/file_io_win32.cc
+++ b/iree/base/internal/file_io_win32.cc
@@ -18,6 +18,7 @@
 #include "iree/base/internal/file_handle_win32.h"
 #include "iree/base/platform_headers.h"
 #include "iree/base/target_platform.h"
+#include "iree/base/tracing.h"
 
 #if defined(IREE_PLATFORM_WINDOWS)
 
@@ -25,6 +26,7 @@
 namespace file_io {
 
 Status FileExists(const std::string& path) {
+  IREE_TRACE_SCOPE0("file_io::FileExists");
   DWORD attrs = ::GetFileAttributesA(path.c_str());
   if (attrs == INVALID_FILE_ATTRIBUTES) {
     return Win32ErrorToCanonicalStatusBuilder(GetLastError(), IREE_LOC)
@@ -34,6 +36,7 @@
 }
 
 StatusOr<std::string> GetFileContents(const std::string& path) {
+  IREE_TRACE_SCOPE0("file_io::GetFileContents");
   ASSIGN_OR_RETURN(auto file, FileHandle::OpenRead(std::move(path),
                                                    FILE_FLAG_SEQUENTIAL_SCAN));
   std::string result;
@@ -53,6 +56,7 @@
 }
 
 Status SetFileContents(const std::string& path, const std::string& content) {
+  IREE_TRACE_SCOPE0("file_io::SetFileContents");
   ASSIGN_OR_RETURN(auto file, FileHandle::OpenWrite(std::move(path), 0));
   if (::WriteFile(file->handle(), content.data(), content.size(), NULL, NULL) ==
       FALSE) {
@@ -64,6 +68,7 @@
 }
 
 Status DeleteFile(const std::string& path) {
+  IREE_TRACE_SCOPE0("file_io::DeleteFile");
   if (::DeleteFileA(path.c_str()) == FALSE) {
     return Win32ErrorToCanonicalStatusBuilder(GetLastError(), IREE_LOC)
            << "Unable to delete/access file: " << path;
@@ -73,6 +78,7 @@
 
 Status MoveFile(const std::string& source_path,
                 const std::string& destination_path) {
+  IREE_TRACE_SCOPE0("file_io::MoveFile");
   if (::MoveFileA(source_path.c_str(), destination_path.c_str()) == FALSE) {
     return Win32ErrorToCanonicalStatusBuilder(GetLastError(), IREE_LOC)
            << "Unable to move file " << source_path << " to "
diff --git a/iree/base/tracing.cc b/iree/base/tracing.cc
index b416526..7049d39 100644
--- a/iree/base/tracing.cc
+++ b/iree/base/tracing.cc
@@ -12,212 +12,143 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Force the header to detect WTF_ENABLE so that this library builds
-// (for when building recursively).
-#if !defined(WTF_ENABLE)
-#define WTF_ENABLE
-#endif
-
 #include "iree/base/tracing.h"
 
-#include <thread>  // NOLINT
+// Textually include the Tracy implementation.
+// We do this here instead of relying on an external build target so that we can
+// ensure our configuration specified in tracing.h is picked up.
+#if IREE_TRACING_FEATURES != 0
+#include "third_party/tracy/TracyClient.cpp"
+#endif  // IREE_TRACING_FEATURES
 
-#include "absl/base/attributes.h"
-#include "absl/base/const_init.h"
-#include "absl/base/thread_annotations.h"
-#include "absl/flags/flag.h"
-#include "absl/strings/str_cat.h"
-#include "absl/synchronization/mutex.h"
-#include "absl/time/clock.h"
-#include "iree/base/file_io.h"
-#include "iree/base/file_path.h"
-#include "iree/base/initializer.h"
-#include "iree/base/logging.h"
-#include "iree/base/status.h"
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
 
-ABSL_FLAG(int32_t, iree_trace_file_period, 5,
-          "Seconds between automatic flushing of WTF trace files. 0 to "
-          "disable auto-flush.");
-ABSL_FLAG(std::string, iree_trace_file, "",
-          "wtf-trace file to save if --define=GLOBAL_WTF_ENABLE=1 was used "
-          "when building.");
+#if IREE_TRACING_FEATURES != 0
 
-namespace iree {
-namespace {
-
-// Guards global WTF state (like the flush fiber and IO).
-ABSL_CONST_INIT absl::Mutex global_tracing_mutex(absl::kConstInit);
-
-// True when tracing has been enabled and initialized.
-bool global_tracing_initialized ABSL_GUARDED_BY(global_tracing_mutex) = false;
-
-// If there is an existing file at the given path back it up by moving it aside.
-// Only kMaxBackups will be kept to avoid unbounded growth.
-void RollTraceFiles(const std::string& path) {
-  std::string path_stem = file_path::JoinPaths(file_path::DirectoryName(path),
-                                               file_path::Stem(path));
-  const int kMaxBackups = 5;
-  for (int i = kMaxBackups; i >= 0; i--) {
-    std::string source_name;
-    if (i > 0) {
-      source_name = absl::StrCat(path_stem, ".", i, ".wtf-trace");
-    } else {
-      source_name = path;
-    }
-    if (!file_io::FileExists(source_name).ok()) {
-      continue;
-    }
-
-    Status status;
-    if (i == kMaxBackups) {
-      status = file_io::DeleteFile(source_name);
-    } else {
-      std::string backup_name =
-          absl::StrCat(path_stem, ".", (i + 1), ".wtf-trace");
-      status = file_io::MoveFile(source_name, backup_name);
-    }
-    if (!status.ok()) {
-      LOG(WARNING) << "Could not remove backup trace file " << source_name
-                   << ": " << status;
-    }
-  }
+void iree_tracing_set_thread_name_impl(const char* name) {
+  tracy::SetThreadName(name);
 }
 
-// Flushes all recorded trace data since the last flush.
-void FlushTraceFile(absl::optional<absl::string_view> explicit_trace_path)
-    ABSL_EXCLUSIVE_LOCKS_REQUIRED(global_tracing_mutex) {
-  if (!global_tracing_initialized) return;
+iree_zone_id_t iree_tracing_zone_begin_impl(
+    const struct ___tracy_source_location_data* src_loc, const char* name,
+    size_t name_length) {
+  const iree_zone_id_t zone_id = tracy::GetProfiler().GetNextZoneId();
 
-  static std::string* current_trace_path = nullptr;
-  static ::wtf::Runtime::SaveCheckpoint checkpoint;
-  static bool is_first_flush = false;
+#ifndef TRACY_NO_VERIFY
+  {
+    TracyLfqPrepareC(tracy::QueueType::ZoneValidation);
+    tracy::MemWrite(&item->zoneValidation.id, zone_id);
+    TracyLfqCommitC;
+  }
+#endif  // TRACY_NO_VERIFY
 
-  // Detect whether explicitly overriding the trace file.
-  if (explicit_trace_path) {
-    if (!current_trace_path || *current_trace_path != *explicit_trace_path) {
-      // Reset.
-      delete current_trace_path;
-      current_trace_path = new std::string(*explicit_trace_path);
+  {
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    TracyLfqPrepareC(tracy::QueueType::ZoneBeginCallstack);
+#else
+    TracyLfqPrepareC(tracy::QueueType::ZoneBegin);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    tracy::MemWrite(&item->zoneBegin.time, tracy::Profiler::GetTime());
+    tracy::MemWrite(&item->zoneBegin.srcloc,
+                    reinterpret_cast<uint64_t>(src_loc));
+    TracyLfqCommitC;
+  }
 
-      // Trigger first flush semantics.
-      is_first_flush = true;
-      checkpoint = ::wtf::Runtime::SaveCheckpoint();
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+  tracy::GetProfiler().SendCallstack(IREE_TRACING_MAX_CALLSTACK_DEPTH);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+
+  if (name_length) {
+#ifndef TRACY_NO_VERIFY
+    {
+      TracyLfqPrepareC(tracy::QueueType::ZoneValidation);
+      tracy::MemWrite(&item->zoneValidation.id, zone_id);
+      TracyLfqCommitC;
     }
-  } else if (!current_trace_path) {
-    // Resolve implicitly from flags.
-    const auto& implicit_trace_path = absl::GetFlag(FLAGS_iree_trace_file);
-    if (!implicit_trace_path.empty()) {
-      current_trace_path = new std::string(implicit_trace_path);
-      // Trigger first flush semantics.
-      is_first_flush = true;
-      checkpoint = ::wtf::Runtime::SaveCheckpoint();
-    }
+#endif  // TRACY_NO_VERIFY
+    auto name_ptr =
+        reinterpret_cast<char*>(tracy::tracy_malloc(name_length + 1));
+    memcpy(name_ptr, name, name_length);
+    name_ptr[name_length] = '\0';
+    TracyLfqPrepareC(tracy::QueueType::ZoneName);
+    tracy::MemWrite(&item->zoneText.text, reinterpret_cast<uint64_t>(name_ptr));
+    TracyLfqCommitC;
   }
 
-  if (!current_trace_path) {
-    return;
-  }
-
-  if (is_first_flush) {
-    // Backup existing any existing trace files at the specified path.
-    RollTraceFiles(*current_trace_path);
-  }
-
-  auto save_options =
-      ::wtf::Runtime::SaveOptions::ForStreamingFile(&checkpoint);
-  if (is_first_flush) {
-    // On the first time, truncate the file. All subsequent flushes append.
-    save_options.open_mode = std::ios_base::trunc;
-  }
-
-  is_first_flush = false;
-
-  auto* runtime = ::wtf::Runtime::GetInstance();
-  if (!runtime->SaveToFile(*current_trace_path, save_options)) {
-    LOG(ERROR) << "Error saving WTF file: " << *current_trace_path;
-    return;
-  }
-
-  VLOG(1) << "Flushed WTF trace to: " << *current_trace_path;
+  return zone_id;
 }
 
-}  // namespace
+iree_zone_id_t iree_tracing_zone_begin_external_impl(
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length) {
+  // NOTE: cloned from tracy::Profiler::AllocSourceLocation so that we can use
+  // the string lengths we already have.
+  const uint32_t src_loc_length =
+      static_cast<uint32_t>(4 + 4 + 4 + function_name_length + 1 +
+                            file_name_length + 1 + name_length);
+  auto ptr = reinterpret_cast<char*>(tracy::tracy_malloc(src_loc_length));
+  memcpy(ptr, &src_loc_length, 4);
+  memset(ptr + 4, 0, 4);
+  memcpy(ptr + 8, &line, 4);
+  memcpy(ptr + 12, function_name, function_name_length + 1);
+  memcpy(ptr + 12 + function_name_length + 1, file_name, file_name_length + 1);
+  if (name_length) {
+    memcpy(ptr + 12 + function_name_length + 1 + file_name_length + 1, name,
+           name_length);
+  }
+  uint64_t src_loc = reinterpret_cast<uint64_t>(ptr);
 
-void InitializeTracing() {
-  if (!::wtf::kMasterEnable) {
-    if (!absl::GetFlag(FLAGS_iree_trace_file).empty()) {
-      LOG(WARNING) << "WTF trace save requested but WTF is not compiled in. "
-                   << "Enable by building with --define=GLOBAL_WTF_ENABLE=1.";
-    }
-    return;
+  const iree_zone_id_t zone_id = tracy::GetProfiler().GetNextZoneId();
+
+#ifndef TRACY_NO_VERIFY
+  {
+    TracyLfqPrepareC(tracy::QueueType::ZoneValidation);
+    tracy::MemWrite(&item->zoneValidation.id, zone_id);
+    TracyLfqCommitC;
+  }
+#endif  // TRACY_NO_VERIFY
+
+  {
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    TracyLfqPrepareC(tracy::QueueType::ZoneBeginAllocSrcLocCallstack);
+#else
+    TracyLfqPrepareC(tracy::QueueType::ZoneBeginAllocSrcLoc);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+    tracy::MemWrite(&item->zoneBegin.time, tracy::Profiler::GetTime());
+    tracy::MemWrite(&item->zoneBegin.srcloc, src_loc);
+    TracyLfqCommitC;
   }
 
-  absl::MutexLock lock(&global_tracing_mutex);
-  if (global_tracing_initialized) return;
-  global_tracing_initialized = true;
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
+  tracy::GetProfiler().SendCallstack(IREE_TRACING_MAX_CALLSTACK_DEPTH);
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
 
-  // Enable tracing on this thread, which we know is main.
-  IREE_TRACE_THREAD_ENABLE("main");
-
-  // Register atexit callback to stop tracking.
-  atexit(StopTracing);
-
-  LOG(INFO) << "Tracing enabled and streaming to: "
-            << absl::GetFlag(FLAGS_iree_trace_file);
-
-  // Launch a thread to periodically flush the trace.
-  if (absl::GetFlag(FLAGS_iree_trace_file_period) > 0 &&
-      !absl::GetFlag(FLAGS_iree_trace_file).empty()) {
-    absl::Duration period =
-        absl::Seconds(absl::GetFlag(FLAGS_iree_trace_file_period));
-    StartTracingAutoFlush(period);
-  }
+  return zone_id;
 }
 
-bool IsTracingAvailable() { return ::wtf::kMasterEnable; }
-
-void StartTracingAutoFlush(absl::Duration period) {
-  static std::thread flush_thread = ([period]() -> std::thread {
-    flush_thread = std::thread([period]() {
-      while (true) {
-        absl::SleepFor(period);
-        absl::MutexLock lock(&global_tracing_mutex);
-        if (!global_tracing_initialized) {
-          return;
-        }
-        FlushTraceFile(absl::optional<absl::string_view>());
-      }
-    });
-    flush_thread.detach();
-    return std::move(flush_thread);
-  })();
+void iree_tracing_set_plot_type_impl(const char* name_literal,
+                                     uint8_t plot_type) {
+  tracy::Profiler::ConfigurePlot(name_literal,
+                                 static_cast<tracy::PlotFormatType>(plot_type));
 }
 
-// Stops tracing if currently initialized.
-void StopTracing() {
-  if (!::wtf::kMasterEnable) return;
-  absl::MutexLock lock(&global_tracing_mutex);
-  if (!global_tracing_initialized) return;
-
-  // Flush any pending trace data.
-  FlushTraceFile(absl::optional<absl::string_view>());
-
-  // Mark WTF as uninitialized to kill the flush thread.
-  global_tracing_initialized = false;
-
-  LOG(INFO) << "Tracing stopped and flushed to file: "
-            << absl::GetFlag(FLAGS_iree_trace_file);
+void iree_tracing_plot_value_i64_impl(const char* name_literal, int64_t value) {
+  tracy::Profiler::PlotData(name_literal, value);
 }
 
-void FlushTrace(absl::optional<absl::string_view> explicit_trace_path) {
-  if (!::wtf::kMasterEnable) return;
-  absl::MutexLock lock(&global_tracing_mutex);
-  if (!global_tracing_initialized) return;
-  FlushTraceFile(explicit_trace_path);
+void iree_tracing_plot_value_f32_impl(const char* name_literal, float value) {
+  tracy::Profiler::PlotData(name_literal, value);
 }
 
-}  // namespace iree
+void iree_tracing_plot_value_f64_impl(const char* name_literal, double value) {
+  tracy::Profiler::PlotData(name_literal, value);
+}
 
-IREE_DECLARE_MODULE_INITIALIZER(iree_tracing);
+#endif  // IREE_TRACING_FEATURES
 
-IREE_REGISTER_MODULE_INITIALIZER(iree_tracing, ::iree::InitializeTracing());
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/iree/base/tracing.h b/iree/base/tracing.h
index a37b835..169393c 100644
--- a/iree/base/tracing.h
+++ b/iree/base/tracing.h
@@ -12,93 +12,412 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Utilities for profiling and tracing.
-// These attempt to support the various tools we use in a way that scales better
-// than one annotation per tool per site and ensures things stay consistent and
-// easy to correlate across tools.
+// Utilities for runtime tracing support.
+// These allow the various runtime subsystems to insert trace events, attach
+// metadata to events or allocations, and control tracing verbosity.
 //
-// Tracing with WTF:
-// - build with --define=GLOBAL_WTF_ENABLE=1
-// - pass --iree_trace_file=/tmp/foo.wtf-trace when running
-// - view trace in WTF UI
+// Tracing features can be enabled with either an IREE_TRACING_MODE define that
+// allows predefined tracing modes or individual IREE_TRACING_FEATURE_* flags
+// set on IREE_TRACING_FEATURES when a more custom set of features is
+// required. Exact feature support may vary on platform and toolchain.
 //
-// If GLOBAL_WTF_ENABLE=1 is specified WTF will automatically be initialized on
-// startup and flushed on exit.
+// The tracing infrastructure is currently designed to target the Tracy
+// profiler: https://github.com/wolfpld/tracy
+// Tracy's profiler UI allowing for streaming captures and analysis can be
+// downloaded from: https://github.com/wolfpld/tracy/releases
+// The manual provided on the releases page contains more information about how
+// Tracy works, its limitations, and how to operate the UI.
+//
+// NOTE: this header is used both from C and C++ code and only conditionally
+// enables the C++ API when in a valid context. Do not use C++ features or
+// include other files that are not C-compatible.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "absl/base/attributes.h"
 
 #ifndef IREE_BASE_TRACING_H_
 #define IREE_BASE_TRACING_H_
 
-#include "absl/strings/string_view.h"
-#include "absl/time/time.h"
-#include "absl/types/optional.h"
+//===----------------------------------------------------------------------===//
+// IREE_TRACING_FEATURE_* flags and options
+//===----------------------------------------------------------------------===//
 
-#if defined(WTF_ENABLE)
+// Enables IREE_TRACE_* macros for instrumented tracing.
+#define IREE_TRACING_FEATURE_INSTRUMENTATION (1 << 0)
 
-#include "wtf/event.h"   // IWYU pragma: export
-#include "wtf/macros.h"  // IWYU pragma: export
+// Captures callstacks up to IREE_TRACING_MAX_CALLSTACK_DEPTH at all
+// IREE_TRACE_* events. This has a significant performance impact and should
+// only be enabled when tracking down missing instrumentation.
+#define IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS (1 << 1)
 
-namespace iree {
+// Tracks all allocations (we know about) via new/delete/malloc/free.
+// This allows fine-grained allocation and usage tracking down to the code that
+// performed the allocations. Allocations or frees that are performed outside of
+// the IREE API or runtime library will not be tracked and unbalanced usage
+// (allocating with IREE's API then freeing with stdlib free, for example) will
+// cause Tracy to become very unhappy.
+#define IREE_TRACING_FEATURE_ALLOCATION_TRACKING (1 << 2)
 
-// Initializes tracing if it is built into the binary.
-// Does nothing if already initialized.
-void InitializeTracing();
+// Captures callstacks up to IREE_TRACING_MAX_CALLSTACK_DEPTH at all allocation
+// events when allocation tracking is enabled.
+#define IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS (1 << 3)
 
-// Returns whether tracing support is compiled into the binary.
-bool IsTracingAvailable();
+#if !defined(IREE_TRACING_MAX_CALLSTACK_DEPTH)
+// Tracing functions that capture stack traces will only capture up to N frames.
+// The overhead for stack walking scales linearly with the number of frames
+// captured and can increase the cost of an event capture by orders of
+// magnitude.
+// Minimum: 0 (disable)
+// Maximum: 62
+#define IREE_TRACING_MAX_CALLSTACK_DEPTH 16
+#endif  // IREE_TRACING_MAX_CALLSTACK_DEPTH
 
-// Starts a background auto flush thread (if not already started). This will
-// cause the trace file to be appended to at the given period.
-void StartTracingAutoFlush(absl::Duration period);
+//===----------------------------------------------------------------------===//
+// IREE_TRACING_MODE simple setting
+//===----------------------------------------------------------------------===//
 
-// Stops tracing and flushes any pending data.
-void StopTracing();
+// Set IREE_TRACING_FEATURES based on IREE_TRACING_MODE if the user hasn't
+// overridden it with more specific settings.
+//
+// IREE_TRACING_MODE = 0: tracing disabled
+// IREE_TRACING_MODE = 1: instrumentation and basic statistics
+// IREE_TRACING_MODE = 2: same as 1 with added allocation tracking
+// IREE_TRACING_MODE = 3: same as 2 with callstacks for allocations
+// IREE_TRACING_MODE = 4: same as 3 with callstacks for all instrumentation
+#if !defined(IREE_TRACING_FEATURES)
+#if defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 1
+#define IREE_TRACING_FEATURES (IREE_TRACING_FEATURE_INSTRUMENTATION)
+#undef IREE_TRACING_MAX_CALLSTACK_DEPTH
+#define IREE_TRACING_MAX_CALLSTACK_DEPTH 0
+#elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 2
+#define IREE_TRACING_FEATURES             \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION | \
+   IREE_TRACING_FEATURE_ALLOCATION_TRACKING)
+#elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 3
+#define IREE_TRACING_FEATURES                 \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION |     \
+   IREE_TRACING_FEATURE_ALLOCATION_TRACKING | \
+   IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS)
+#elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE >= 4
+#define IREE_TRACING_FEATURES                        \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION |            \
+   IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS | \
+   IREE_TRACING_FEATURE_ALLOCATION_TRACKING |        \
+   IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS)
+#else
+#define IREE_TRACING_FEATURES 0
+#endif  // IREE_TRACING_MODE
+#endif  // !IREE_TRACING_FEATURES
 
-// Flushes pending trace data to disk, if enabled.
-void FlushTrace(absl::optional<absl::string_view> explicit_trace_path =
-                    absl::optional<absl::string_view>());
+//===----------------------------------------------------------------------===//
+// Tracy configuration
+//===----------------------------------------------------------------------===//
+// NOTE: order matters here as we are including files that require/define.
 
-// Enables the current thread for WTF profiling/tracing.
-#define IREE_TRACE_THREAD_ENABLE(name) WTF_THREAD_ENABLE(name);
+// Enable Tracy only when we are using tracing features.
+#if IREE_TRACING_FEATURES != 0
+#define TRACY_ENABLE 1
+#endif  // IREE_TRACING_FEATURES
 
-// Tracing scope that emits WTF tracing scopes depending on whether
-// profiling/tracing are enabled.
-// See WTF_SCOPE0 for more information.
-#define IREE_TRACE_SCOPE0(name_spec) WTF_SCOPE0(name_spec);
+// Disable zone nesting verification in release builds.
+// The verification makes it easy to find unbalanced zones but doubles the cost
+// (at least) of each zone recorded. Run in debug builds to verify new
+// instrumentation is correct before capturing traces in release builds.
+#if defined(NDEBUG)
+#define TRACY_NO_VERIFY 1
+#endif  // NDEBUG
 
-// Tracing scope that emits WTF tracing scopes with additional
-// arguments depending on whether profiling/tracing is enabled.
-// See WTF_SCOPE for more information.
-#define IREE_TRACE_SCOPE(name_spec, ...) WTF_SCOPE(name_spec, __VA_ARGS__)
+// Force callstack capture on all zones (even those without the C suffix).
+#if (IREE_TRACING_FEATURES &                             \
+     IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS) || \
+    (IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS)
+#define TRACY_CALLSTACK 1
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS
 
-// Tracing event that emits a WTF event.
-// See WTF_EVENT0 for more information.
-#define IREE_TRACE_EVENT0 WTF_EVENT0
+// TODO(#1926): upstream a TRACY_NO_FRAME_IMAGE flag to remove the frame
+// compression thread and dxt1 compression code.
 
-// Tracing event that emits a WTF event with additional arguments.
-// See WTF_EVENT for more information.
-#define IREE_TRACE_EVENT WTF_EVENT
+// Flush the settings we have so far; settings after this point will be
+// overriding values set by Tracy itself.
+#if defined(TRACY_ENABLE)
+#include "third_party/tracy/TracyC.h"  // IWYU pragma: export
+#endif
 
-}  // namespace iree
+// Disable callstack capture if our depth is 0; this allows us to avoid any
+// expensive capture (and all the associated dependencies) if we aren't going to
+// use it. Note that this means that unless code is instrumented we won't be
+// able to tell what's happening in the Tracy UI.
+#if IREE_TRACING_MAX_CALLSTACK_DEPTH == 0
+#undef TRACY_HAS_CALLSTACK
+#endif  // IREE_TRACING_MAX_CALLSTACK_DEPTH
+
+//===----------------------------------------------------------------------===//
+// C API used for Tracy control
+//===----------------------------------------------------------------------===//
+// These functions are implementation details and should not be called directly.
+// Always use the macros (or C++ RAII types).
+
+// Local zone ID used for the C IREE_TRACE_ZONE_* macros.
+typedef uint32_t iree_zone_id_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#if IREE_TRACING_FEATURES
+
+void iree_tracing_set_thread_name_impl(const char* name);
+
+ABSL_MUST_USE_RESULT iree_zone_id_t iree_tracing_zone_begin_impl(
+    const struct ___tracy_source_location_data* src_loc, const char* name,
+    size_t name_length);
+ABSL_MUST_USE_RESULT iree_zone_id_t iree_tracing_zone_begin_external_impl(
+    const char* file_name, size_t file_name_length, uint32_t line,
+    const char* function_name, size_t function_name_length, const char* name,
+    size_t name_length);
+
+void iree_tracing_set_plot_type_impl(const char* name_literal,
+                                     uint8_t plot_type);
+void iree_tracing_plot_value_i64_impl(const char* name_literal, int64_t value);
+void iree_tracing_plot_value_f32_impl(const char* name_literal, float value);
+void iree_tracing_plot_value_f64_impl(const char* name_literal, double value);
+
+#endif  // IREE_TRACING_FEATURES
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Instrumentation macros (C)
+//===----------------------------------------------------------------------===//
+
+// Matches Tracy's PlotFormatType enum.
+enum {
+  // Values will be displayed as plain numbers.
+  IREE_TRACING_PLOT_TYPE_NUMBER = 0,
+  // Treats the values as memory sizes. Will display kilobytes, megabytes, etc.
+  IREE_TRACING_PLOT_TYPE_MEMORY = 1,
+  // Values will be displayed as percentage with value 100 being equal to 100%.
+  IREE_TRACING_PLOT_TYPE_PERCENTAGE = 2,
+};
+
+// Colors used for messages based on the level provided to the macro.
+enum {
+  IREE_TRACING_MESSAGE_LEVEL_ERROR = 0xFF0000u,
+  IREE_TRACING_MESSAGE_LEVEL_WARNING = 0xFFFF00u,
+  IREE_TRACING_MESSAGE_LEVEL_INFO = 0xFFFFFFu,
+  IREE_TRACING_MESSAGE_LEVEL_VERBOSE = 0xC0C0C0u,
+  IREE_TRACING_MESSAGE_LEVEL_DEBUG = 0x00FF00u,
+};
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// Sets an application-specific payload that will be stored in the trace.
+// This can be used to fingerprint traces to particular versions and denote
+// compilation options or configuration. The given string value will be copied.
+#define IREE_TRACE_SET_APP_INFO(value, value_length) \
+  ___tracy_emit_message_appinfo(value, value_length)
+
+// Sets the current thread name to the given string value.
+// This will only set the thread name as it appears in the tracing backend and
+// not set the OS thread name as it would appear in a debugger.
+// The C-string |name| will be copied and does not need to be a literal.
+#define IREE_TRACE_SET_THREAD_NAME(name) iree_tracing_set_thread_name_impl(name)
+
+// Begins a new zone with the parent function name.
+#define IREE_TRACE_ZONE_BEGIN(zone_id) \
+  IREE_TRACE_ZONE_BEGIN_NAMED(zone_id, NULL)
+
+// Begins a new zone with the given compile-time literal name.
+#define IREE_TRACE_ZONE_BEGIN_NAMED(zone_id, name_literal)                    \
+  static const struct ___tracy_source_location_data TracyConcat(              \
+      __tracy_source_location, __LINE__) = {name_literal, __FUNCTION__,       \
+                                            __FILE__, (uint32_t)__LINE__, 0}; \
+  iree_zone_id_t zone_id = iree_tracing_zone_begin_impl(                      \
+      &TracyConcat(__tracy_source_location, __LINE__), NULL, 0);
+
+// Begins a new zone with the given runtime dynamic string name.
+// The |name| string will be copied into the trace buffer.
+#define IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(zone_id, name, name_length)   \
+  static const struct ___tracy_source_location_data TracyConcat(          \
+      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \
+                                            (uint32_t)__LINE__, 0};       \
+  iree_zone_id_t zone_id = iree_tracing_zone_begin_impl(                  \
+      &TracyConcat(__tracy_source_location, __LINE__), name, name_length);
+
+// Begins an externally defined zone with a dynamic source location.
+// The |file_name|, |function_name|, and optional |name| strings will be copied
+// into the trace buffer and do not need to persist.
+#define IREE_TRACE_ZONE_BEGIN_EXTERNAL(                                       \
+    zone_id, file_name, file_name_length, line, function_name,                \
+    function_name_length, name, name_length)                                  \
+  iree_zone_id_t zone_id = iree_tracing_zone_begin_external_impl(             \
+      file_name, file_name_length, line, function_name, function_name_length, \
+      name, name_length)
+
+// Appends a string value to the parent zone. May be called multiple times.
+// The |value| string will be copied into the trace buffer.
+#define IREE_TRACE_ZONE_APPEND_TEXT(...)                                  \
+  IREE_TRACE_IMPL_GET_VARIADIC_((__VA_ARGS__,                             \
+                                 IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW, \
+                                 IREE_TRACE_ZONE_APPEND_TEXT_CSTRING))    \
+  (__VA_ARGS__)
+#define IREE_TRACE_ZONE_APPEND_TEXT_CSTRING(zone_id, value) \
+  IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(zone_id, value, strlen(value))
+#define IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(zone_id, value, value_length)  \
+  ___tracy_emit_zone_text((struct ___tracy_c_zone_context){zone_id, 1}, value, \
+                          value_length)
+
+// Ends the current zone. Must be passed the |zone_id| from the _BEGIN.
+#define IREE_TRACE_ZONE_END(zone_id) \
+  ___tracy_emit_zone_end((struct ___tracy_c_zone_context){zone_id, 1})
+
+// Configures the named plot with an IREE_TRACING_PLOT_TYPE_* representation.
+#define IREE_TRACE_SET_PLOT_TYPE(name_literal, plot_type) \
+  iree_tracing_set_plot_type_impl(name_literal, plot_type)
+// Plots a value in the named plot group as an integer.
+#define IREE_TRACE_PLOT_VALUE_I64(name_literal, value) \
+  iree_tracing_plot_value_i64_impl(name_literal, value)
+// Plots a value in the named plot group as a single-precision float.
+#define IREE_TRACE_PLOT_VALUE_F32(name_literal, value) \
+  iree_tracing_plot_value_f32_impl(name_literal, value)
+// Plots a value in the named plot group as a double-precision float.
+#define IREE_TRACE_PLOT_VALUE_F64(name_literal, value) \
+  iree_tracing_plot_value_f64_impl(name_literal, value)
+
+// Demarcates an advancement of the top-level unnamed frame group.
+#define IREE_TRACE_FRAME_MARK() ___tracy_emit_frame_mark(NULL)
+// Demarcates an advancement of a named frame group.
+#define IREE_TRACE_FRAME_MARK_NAMED(name_literal) \
+  ___tracy_emit_frame_mark(name_literal)
+// Begins a discontinuous frame in a named frame group.
+// Must be properly matched with an IREE_TRACE_FRAME_MARK_END_NAMED.
+#define IREE_TRACE_FRAME_MARK_BEGIN_NAMED(name_literal) \
+  ___tracy_emit_frame_mark_start(name_literal)
+// Ends a discontinuous frame in a named frame group.
+#define IREE_TRACE_FRAME_MARK_END_NAMED(name_literal) \
+  ___tracy_emit_frame_mark_end(name_literal)
+
+// Logs a message at the given logging level to the trace.
+// The message text must be a compile-time string literal.
+#define IREE_TRACE_MESSAGE(level, value_literal) \
+  ___tracy_emit_messageLC(value_literal, IREE_TRACING_MESSAGE_LEVEL_##level, 0)
+// Logs a dynamically-allocated message at the given logging level to the trace.
+// The string |value| will be copied into the trace buffer.
+#define IREE_TRACE_MESSAGE_DYNAMIC(level, value, value_length) \
+  ___tracy_emit_messageC(value, value_length,                  \
+                         IREE_TRACING_MESSAGE_LEVEL_##level, 0)
+
+// Utilities:
+#define IREE_TRACE_IMPL_GET_VARIADIC_HELPER_(_1, _2, _3, NAME, ...) NAME
+#define IREE_TRACE_IMPL_GET_VARIADIC_(args) \
+  IREE_TRACE_IMPL_GET_VARIADIC_HELPER_ args
+
+#else
+#define IREE_TRACE_SET_APP_INFO(value, value_length)
+#define IREE_TRACE_SET_THREAD_NAME(name)
+#define IREE_TRACE_ZONE_BEGIN(zone_id)
+#define IREE_TRACE_ZONE_BEGIN_NAMED(zone_id, name_literal)
+#define IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(zone_id, name, name_length)
+#define IREE_TRACE_ZONE_BEGIN_EXTERNAL(                        \
+    zone_id, file_name, file_name_length, line, function_name, \
+    function_name_length, name, name_length)
+#define IREE_TRACE_ZONE_APPEND_TEXT(...)
+#define IREE_TRACE_ZONE_END(zone_id)
+#define IREE_TRACE_SET_PLOT_TYPE(name_literal, plot_type)
+#define IREE_TRACE_PLOT_VALUE_I64(name_literal, value)
+#define IREE_TRACE_PLOT_VALUE_F32(name_literal, value)
+#define IREE_TRACE_PLOT_VALUE_F64(name_literal, value)
+#define IREE_TRACE_FRAME_MARK()
+#define IREE_TRACE_FRAME_MARK_NAMED(name_literal)
+#define IREE_TRACE_FRAME_MARK_BEGIN_NAMED(name_literal)
+#define IREE_TRACE_FRAME_MARK_END_NAMED(name_literal)
+#define IREE_TRACE_MESSAGE(level, value_literal)
+#define IREE_TRACE_MESSAGE_DYNAMIC(level, value, value_length)
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION
+
+//===----------------------------------------------------------------------===//
+// Allocation tracking macros (C/C++)
+//===----------------------------------------------------------------------===//
+//
+// IREE_TRACE_ALLOC: records a malloc.
+// IREE_TRACE_FREE: records a free.
+//
+// NOTE: realloc must be recorded as a FREE/ALLOC pair.
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS
+
+#define IREE_TRACE_ALLOC(ptr, size)               \
+  ___tracy_emit_memory_alloc_callstack(ptr, size, \
+                                       IREE_TRACING_MAX_CALLSTACK_DEPTH)
+#define IREE_TRACE_FREE(ptr) \
+  ___tracy_emit_memory_free_callstack(ptr, IREE_TRACING_MAX_CALLSTACK_DEPTH)
 
 #else
 
-namespace iree {
+#define IREE_TRACE_ALLOC(ptr, size) ___tracy_emit_memory_alloc(ptr, size)
+#define IREE_TRACE_FREE(ptr) ___tracy_emit_memory_free(ptr)
 
-inline void InitializeTracing() {}
-inline bool IsTracingAvailable() { return false; }
-inline void StartTracingAutoFlush(absl::Duration period) {}
-inline void StopTracing() {}
-inline void FlushTrace(absl::optional<absl::string_view> explicit_trace_path =
-                           absl::optional<absl::string_view>()) {}
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS
 
+#else
+#define IREE_TRACE_ALLOC(ptr, size)
+#define IREE_TRACE_FREE(ptr)
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+#ifdef __cplusplus
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+inline void* operator new(size_t count) {
+  auto ptr = malloc(count);
+  IREE_TRACE_ALLOC(ptr, count);
+  return ptr;
+}
+
+inline void operator delete(void* ptr) noexcept {
+  IREE_TRACE_FREE(ptr);
+  free(ptr);
+}
+
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Instrumentation C++ RAII types, wrappers, and macros
+//===----------------------------------------------------------------------===//
+
+#ifdef __cplusplus
+
+#if defined(TRACY_ENABLE)
+#include "third_party/tracy/Tracy.hpp"  // IWYU pragma: export
+#endif
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+// TODO(#1886): update these to tracy and drop the 0.
+#define IREE_TRACE_SCOPE0(name_spec) ZoneScopedNS(name_spec, 13)
+#define IREE_TRACE_SCOPE(name_spec, ...)
+#define IREE_TRACE_EVENT0
+#define IREE_TRACE_EVENT
+
+#else
 #define IREE_TRACE_THREAD_ENABLE(name)
 #define IREE_TRACE_SCOPE0(name_spec)
 #define IREE_TRACE_SCOPE(name_spec, ...) (void)
 #define IREE_TRACE_EVENT0
-#define IREE_TRACE_EVENT (void)
+#define IREE_TRACE_EVENT (void)
+#endif  // IREE_TRACING_FEATURE_INSTRUMENTATION
 
-}  // namespace iree
+// TODO(benvanik): macros for LockableCtx / Lockable mutex tracking.
 
-#endif  // GLOBAL_WTF_ENABLE
+#endif  // __cplusplus
 
 #endif  // IREE_BASE_TRACING_H_
diff --git a/iree/base/tracing_disabled.cc b/iree/base/tracing_disabled.cc
deleted file mode 100644
index 96ea115..0000000
--- a/iree/base/tracing_disabled.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This file is linked in only when WTF is not enabled. It allows us to keep the
-// same flags and functions without needing to do a bunch of ifdef hackery or
-// undefok mangling.
-
-#include <cstdint>
-#include <string>
-
-#include "absl/flags/flag.h"
-#include "iree/base/tracing.h"
-
-// TODO(benvanik): remove this when disabled so that we don't dep on flags.
-ABSL_FLAG(int32_t, iree_trace_file_period, 0,
-          "Flag for tracing. Use --define=GLOBAL_WTF_ENABLE=1 to enable WTF.");
-ABSL_FLAG(std::string, iree_trace_file, "",
-          "Flag for tracing. Use --define=GLOBAL_WTF_ENABLE=1 to enable WTF.");
diff --git a/iree/hal/host/async_command_queue.cc b/iree/hal/host/async_command_queue.cc
index c1c2b73..e964379 100644
--- a/iree/hal/host/async_command_queue.cc
+++ b/iree/hal/host/async_command_queue.cc
@@ -48,8 +48,7 @@
 }
 
 void AsyncCommandQueue::ThreadMain() {
-  // TODO(benvanik): make this safer (may die if trace is flushed late).
-  IREE_TRACE_THREAD_ENABLE(target_queue_->name().c_str());
+  IREE_TRACE_SET_THREAD_NAME(target_queue_->name().c_str());
 
   bool is_exiting = false;
   while (!is_exiting) {
diff --git a/iree/hal/host/host_local_command_processor.cc b/iree/hal/host/host_local_command_processor.cc
index 9d7ed31..a338e94 100644
--- a/iree/hal/host/host_local_command_processor.cc
+++ b/iree/hal/host/host_local_command_processor.cc
@@ -24,9 +24,9 @@
 namespace hal {
 
 HostLocalCommandProcessor::HostLocalCommandProcessor(
-    Allocator* allocator, CommandBufferModeBitfield mode,
-    CommandCategoryBitfield command_categories)
-    : CommandBuffer(allocator, mode, command_categories) {}
+    Allocator* allocator, CommandCategoryBitfield command_categories)
+    : CommandBuffer(allocator, CommandBufferMode::kOneShot,
+                    command_categories) {}
 
 HostLocalCommandProcessor::~HostLocalCommandProcessor() = default;
 
diff --git a/iree/hal/host/host_local_command_processor.h b/iree/hal/host/host_local_command_processor.h
index 70e3ef3..83a40ef 100644
--- a/iree/hal/host/host_local_command_processor.h
+++ b/iree/hal/host/host_local_command_processor.h
@@ -40,7 +40,6 @@
 class HostLocalCommandProcessor : public CommandBuffer {
  public:
   HostLocalCommandProcessor(Allocator* allocator,
-                            CommandBufferModeBitfield mode,
                             CommandCategoryBitfield command_categories);
   ~HostLocalCommandProcessor() override;
 
diff --git a/iree/hal/llvmjit/llvmjit_command_processor.cc b/iree/hal/llvmjit/llvmjit_command_processor.cc
index 36d99c5..ec83420 100644
--- a/iree/hal/llvmjit/llvmjit_command_processor.cc
+++ b/iree/hal/llvmjit/llvmjit_command_processor.cc
@@ -27,9 +27,8 @@
 namespace llvmjit {
 
 LLVMJITCommandProcessor::LLVMJITCommandProcessor(
-    Allocator* allocator, CommandBufferModeBitfield mode,
-    CommandCategoryBitfield command_categories)
-    : HostLocalCommandProcessor(allocator, mode, command_categories) {}
+    Allocator* allocator, CommandCategoryBitfield command_categories)
+    : HostLocalCommandProcessor(allocator, command_categories) {}
 
 LLVMJITCommandProcessor::~LLVMJITCommandProcessor() = default;
 
diff --git a/iree/hal/llvmjit/llvmjit_command_processor.h b/iree/hal/llvmjit/llvmjit_command_processor.h
index df38506..bb4ae74 100644
--- a/iree/hal/llvmjit/llvmjit_command_processor.h
+++ b/iree/hal/llvmjit/llvmjit_command_processor.h
@@ -22,7 +22,7 @@
 
 class LLVMJITCommandProcessor final : public HostLocalCommandProcessor {
  public:
-  LLVMJITCommandProcessor(Allocator* allocator, CommandBufferModeBitfield mode,
+  LLVMJITCommandProcessor(Allocator* allocator,
                           CommandCategoryBitfield command_categories);
   ~LLVMJITCommandProcessor() override;
 
diff --git a/iree/hal/llvmjit/llvmjit_device.cc b/iree/hal/llvmjit/llvmjit_device.cc
index 0926cb1..f3f6fab 100644
--- a/iree/hal/llvmjit/llvmjit_device.cc
+++ b/iree/hal/llvmjit/llvmjit_device.cc
@@ -84,8 +84,8 @@
     for (auto* command_buffer : command_buffers) {
       auto* inproc_command_buffer =
           static_cast<InProcCommandBuffer*>(command_buffer->impl());
-      LLVMJITCommandProcessor command_processor(
-          allocator_, command_buffer->mode(), supported_categories());
+      LLVMJITCommandProcessor command_processor(allocator_,
+                                                supported_categories());
       RETURN_IF_ERROR(inproc_command_buffer->Process(&command_processor));
     }
     return OkStatus();
diff --git a/iree/hal/vmla/BUILD b/iree/hal/vmla/BUILD
index 4d73ac0..88e99dd 100644
--- a/iree/hal/vmla/BUILD
+++ b/iree/hal/vmla/BUILD
@@ -84,6 +84,7 @@
         "//iree/hal/host:host_buffer",
         "//iree/hal/host:host_local_command_processor",
         "//iree/vm:invocation",
+        "//iree/vm:stack",
         "//iree/vm:variant_list",
     ],
 )
diff --git a/iree/hal/vmla/CMakeLists.txt b/iree/hal/vmla/CMakeLists.txt
index a82e361..7b1d000 100644
--- a/iree/hal/vmla/CMakeLists.txt
+++ b/iree/hal/vmla/CMakeLists.txt
@@ -85,6 +85,7 @@
     iree::hal::host::host_buffer
     iree::hal::host::host_local_command_processor
     iree::vm::invocation
+    iree::vm::stack
     iree::vm::variant_list
   PUBLIC
 )
diff --git a/iree/hal/vmla/vmla_cache.cc b/iree/hal/vmla/vmla_cache.cc
index 0d14aea..781acce 100644
--- a/iree/hal/vmla/vmla_cache.cc
+++ b/iree/hal/vmla/vmla_cache.cc
@@ -47,9 +47,9 @@
   // Wrap the data (or copy it).
   bool allow_aliasing_data =
       AllBitsSet(mode, ExecutableCachingMode::kAliasProvidedData);
-  ASSIGN_OR_RETURN(auto executable,
-                   VMLAExecutable::Load(instance_, vmla_module_, spec,
-                                        !allow_aliasing_data));
+  ASSIGN_OR_RETURN(
+      auto executable,
+      VMLAExecutable::Load(instance_, vmla_module_, spec, allow_aliasing_data));
 
   return executable;
 }
diff --git a/iree/hal/vmla/vmla_command_processor.cc b/iree/hal/vmla/vmla_command_processor.cc
index 6d305cb..8e2da12 100644
--- a/iree/hal/vmla/vmla_command_processor.cc
+++ b/iree/hal/vmla/vmla_command_processor.cc
@@ -28,11 +28,16 @@
 namespace vmla {
 
 VMLACommandProcessor::VMLACommandProcessor(
-    Allocator* allocator, CommandBufferModeBitfield mode,
-    CommandCategoryBitfield command_categories)
-    : HostLocalCommandProcessor(allocator, mode, command_categories) {}
+    Allocator* allocator, CommandCategoryBitfield command_categories)
+    : HostLocalCommandProcessor(allocator, command_categories) {
+  // TODO(#1172): embed the stack allocation within the command processor.
+  iree_allocator_malloc(IREE_ALLOCATOR_SYSTEM, sizeof(iree_vm_stack_t),
+                        (void**)&stack_);
+}
 
-VMLACommandProcessor::~VMLACommandProcessor() = default;
+VMLACommandProcessor::~VMLACommandProcessor() {
+  iree_allocator_free(IREE_ALLOCATOR_SYSTEM, stack_);
+}
 
 Status VMLACommandProcessor::DispatchInline(
     Executable* executable, int32_t entry_point,
@@ -64,12 +69,19 @@
     }
   }
 
-  return FromApiStatus(
-      iree_vm_invoke(vmla_executable->context(),
-                     vmla_executable->entry_functions()[entry_point],
-                     /*policy=*/nullptr, vmla_executable->interface_inputs(),
-                     /*outputs=*/nullptr, IREE_ALLOCATOR_SYSTEM),
-      IREE_LOC);
+  RETURN_IF_ERROR(FromApiStatus(
+      iree_vm_stack_init(
+          iree_vm_context_state_resolver(vmla_executable->context()), stack_),
+      IREE_LOC));
+  auto status =
+      FromApiStatus(iree_vm_invoke_within(
+                        vmla_executable->context(), stack_,
+                        vmla_executable->entry_functions()[entry_point],
+                        /*policy=*/nullptr, vmla_executable->interface_inputs(),
+                        /*outputs=*/nullptr),
+                    IREE_LOC);
+  iree_vm_stack_deinit(stack_);
+  return status;
 }
 
 }  // namespace vmla
diff --git a/iree/hal/vmla/vmla_command_processor.h b/iree/hal/vmla/vmla_command_processor.h
index 947ef79..7bf857d 100644
--- a/iree/hal/vmla/vmla_command_processor.h
+++ b/iree/hal/vmla/vmla_command_processor.h
@@ -16,6 +16,7 @@
 #define IREE_HAL_VMLA_VMLA_COMMAND_PROCESSOR_H_
 
 #include "iree/hal/host/host_local_command_processor.h"
+#include "iree/vm/stack.h"
 
 namespace iree {
 namespace hal {
@@ -23,7 +24,7 @@
 
 class VMLACommandProcessor final : public HostLocalCommandProcessor {
  public:
-  VMLACommandProcessor(Allocator* allocator, CommandBufferModeBitfield mode,
+  VMLACommandProcessor(Allocator* allocator,
                        CommandCategoryBitfield command_categories);
   ~VMLACommandProcessor() override;
 
@@ -33,6 +34,9 @@
       const PushConstantBlock& push_constants,
       absl::Span<const absl::Span<const DescriptorSet::Binding>> set_bindings)
       override;
+
+ private:
+  iree_vm_stack_t* stack_ = nullptr;
 };
 
 }  // namespace vmla
diff --git a/iree/hal/vmla/vmla_device.cc b/iree/hal/vmla/vmla_device.cc
index 0d48638..0e4313b 100644
--- a/iree/hal/vmla/vmla_device.cc
+++ b/iree/hal/vmla/vmla_device.cc
@@ -52,7 +52,8 @@
   UnsynchronizedCommandQueue(Allocator* allocator, std::string name,
                              CommandCategoryBitfield supported_categories)
       : CommandQueue(std::move(name), supported_categories),
-        allocator_(allocator) {}
+        allocator_(allocator),
+        command_processor_(allocator_, supported_categories_) {}
   ~UnsynchronizedCommandQueue() override = default;
 
   Status Submit(absl::Span<const SubmissionBatch> batches) override {
@@ -84,14 +85,13 @@
     for (auto* command_buffer : command_buffers) {
       auto* inproc_command_buffer =
           static_cast<InProcCommandBuffer*>(command_buffer->impl());
-      VMLACommandProcessor command_processor(allocator_, command_buffer->mode(),
-                                             supported_categories());
-      RETURN_IF_ERROR(inproc_command_buffer->Process(&command_processor));
+      RETURN_IF_ERROR(inproc_command_buffer->Process(&command_processor_));
     }
     return OkStatus();
   }
 
   Allocator* const allocator_;
+  VMLACommandProcessor command_processor_;
 };
 
 }  // namespace
diff --git a/iree/tools/BUILD b/iree/tools/BUILD
index 62d855a..f0d19cc 100644
--- a/iree/tools/BUILD
+++ b/iree/tools/BUILD
@@ -46,6 +46,7 @@
         "//iree/base:localfile",
         "//iree/base:source_location",
         "//iree/base:status",
+        "//iree/base:tracing",
         "//iree/modules/hal",
         "//iree/testing:benchmark_main",
         "//iree/vm:bytecode_module",
@@ -222,6 +223,7 @@
         "//iree/base:api",
         "//iree/base:api_util",
         "//iree/base:source_location",
+        "//iree/base:tracing",
         "//iree/compiler/Dialect/Flow/Transforms",
         "//iree/compiler/Dialect/HAL/Transforms",
         "//iree/compiler/Dialect/IREE/Transforms",
@@ -258,6 +260,7 @@
         "//iree/base:localfile",
         "//iree/base:source_location",
         "//iree/base:status",
+        "//iree/base:tracing",
         "//iree/modules/hal",
         "//iree/vm:bytecode_module",
     ] + PLATFORM_VULKAN_DEPS + IREE_DRIVER_MODULES,
diff --git a/iree/tools/CMakeLists.txt b/iree/tools/CMakeLists.txt
index eddbce5..ffc23bb 100644
--- a/iree/tools/CMakeLists.txt
+++ b/iree/tools/CMakeLists.txt
@@ -40,6 +40,7 @@
     iree::base::localfile
     iree::base::source_location
     iree::base::status
+    iree::base::tracing
     iree::modules::hal
     iree::testing::benchmark_main
     iree::vm::bytecode_module
@@ -80,6 +81,7 @@
     iree::base::localfile
     iree::base::source_location
     iree::base::status
+    iree::base::tracing
     iree::modules::hal
     iree::vm::bytecode_module
     ${IREE_HAL_DRIVER_MODULES}
@@ -270,6 +272,7 @@
       iree::base::init
       iree::base::source_location
       iree::base::status
+      iree::base::tracing
       iree::compiler::Dialect::Flow::Transforms
       iree::compiler::Dialect::HAL::Transforms
       iree::compiler::Dialect::IREE::Transforms
diff --git a/iree/tools/benchmark_module_main.cc b/iree/tools/benchmark_module_main.cc
index 844f6ad..8c0e3ee 100644
--- a/iree/tools/benchmark_module_main.cc
+++ b/iree/tools/benchmark_module_main.cc
@@ -19,6 +19,7 @@
 #include "iree/base/file_io.h"
 #include "iree/base/source_location.h"
 #include "iree/base/status.h"
+#include "iree/base/tracing.h"
 #include "iree/modules/hal/hal_module.h"
 #include "iree/tools/vm_util.h"
 #include "iree/vm/bytecode_module.h"
@@ -52,6 +53,7 @@
 namespace {
 
 StatusOr<std::string> GetModuleContentsFromFlags() {
+  IREE_TRACE_SCOPE0("GetModuleContentsFromFlags");
   auto input_file = absl::GetFlag(FLAGS_input_file);
   if (input_file.empty()) {
     return InvalidArgumentErrorBuilder(IREE_LOC)
@@ -61,6 +63,8 @@
 }
 
 Status Run(::benchmark::State& state) {
+  IREE_TRACE_SCOPE0("iree-benchmark-module");
+
   RETURN_IF_ERROR(FromApiStatus(iree_hal_module_register_types(), IREE_LOC))
       << "registering HAL types";
   iree_vm_instance_t* instance = nullptr;
diff --git a/iree/tools/run_mlir_main.cc b/iree/tools/run_mlir_main.cc
index bba7481..24e6fca 100644
--- a/iree/tools/run_mlir_main.cc
+++ b/iree/tools/run_mlir_main.cc
@@ -48,6 +48,7 @@
 #include "iree/base/init.h"
 #include "iree/base/source_location.h"
 #include "iree/base/status.h"
+#include "iree/base/tracing.h"
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
 #include "iree/compiler/Dialect/HAL/Transforms/Passes.h"
 #include "iree/compiler/Dialect/IREE/Transforms/Passes.h"
@@ -147,6 +148,7 @@
 
 // Returns a list of target compiler backends to use for file evaluation.
 StatusOr<std::vector<std::string>> GetTargetBackends() {
+  IREE_TRACE_SCOPE0("GetTargetBackends");
   auto target_backends =
       mlir::iree_compiler::IREE::HAL::getTargetOptionsFromFlags().targets;
   if (target_backends.empty()) {
@@ -170,6 +172,8 @@
 StatusOr<std::string> PrepareModule(
     std::string target_backend,
     std::unique_ptr<llvm::MemoryBuffer> file_buffer) {
+  IREE_TRACE_SCOPE0("PrepareModule");
+
   mlir::MLIRContext context;
 
   // Parse input MLIR module.
@@ -258,6 +262,8 @@
                         iree_hal_allocator_t* allocator,
                         iree_vm_function_t function,
                         absl::string_view export_name) {
+  IREE_TRACE_SCOPE0("EvaluateFunction");
+
   std::cout << "EXEC @" << export_name << std::endl;
   ASSIGN_OR_RETURN(auto input_descs, ParseInputSignature(function));
   auto input_values_list = absl::MakeConstSpan(
@@ -293,6 +299,8 @@
 Status EvaluateFunctions(iree_vm_instance_t* instance,
                          absl::string_view driver_name,
                          const std::string& flatbuffer_data) {
+  IREE_TRACE_SCOPE0("EvaluateFunctions");
+
   LOG(INFO) << "Evaluating all functions in module for driver '" << driver_name
             << "'...";
 
@@ -369,6 +377,8 @@
 
 // Translates and runs a single LLVM file buffer.
 Status EvaluateFile(std::unique_ptr<llvm::MemoryBuffer> file_buffer) {
+  IREE_TRACE_SCOPE0("EvaluateFile");
+
   // TODO(benvanik): move to instance-based registration.
   RETURN_IF_ERROR(FromApiStatus(iree_hal_module_register_types(), IREE_LOC))
       << "Registering HAL types";
@@ -381,12 +391,14 @@
   ASSIGN_OR_RETURN(auto target_backends, GetTargetBackends());
   for (const auto& target_backend : target_backends) {
     // Prepare the module for execution and evaluate it.
+    IREE_TRACE_FRAME_MARK();
     auto cloned_file_buffer = llvm::MemoryBuffer::getMemBufferCopy(
         file_buffer->getBuffer(), file_buffer->getBufferIdentifier());
     ASSIGN_OR_RETURN(
         auto flatbuffer_data,
         PrepareModule(target_backend + '*', std::move(cloned_file_buffer)),
         _ << "Translating module");
+    IREE_TRACE_FRAME_MARK();
     RETURN_IF_ERROR(EvaluateFunctions(
         instance, BackendToDriverName(target_backend), flatbuffer_data))
         << "Evaluating functions";
@@ -398,6 +410,8 @@
 
 // Runs the given .mlir file based on the current flags.
 Status RunFile(const std::string& mlir_filename) {
+  IREE_TRACE_SCOPE0("RunFile");
+
   // Load input file/from stdin.
   std::string error_message;
   auto file = mlir::openInputFile(mlir_filename, &error_message);
@@ -448,6 +462,8 @@
 }  // namespace
 
 extern "C" int main(int argc, char** argv) {
+  IREE_TRACE_SCOPE0("iree-run-mlir");
+
   int argc_llvm = argc;
   char** argv_llvm = argv;
   int argc_absl = 1;
@@ -486,7 +502,7 @@
   }
   argc_absl += run_args_flag.size();
   char** argv_absl_ptr = argv_absl.data();
-  InitializeEnvironment(&argc_absl, &argv_absl_ptr);
+  iree::InitializeEnvironment(&argc_absl, &argv_absl_ptr);
 
   auto status = RunFile(input_file_flag);
   if (!status.ok()) {
diff --git a/iree/tools/run_module_main.cc b/iree/tools/run_module_main.cc
index 3c7fd3c..b67098e 100644
--- a/iree/tools/run_module_main.cc
+++ b/iree/tools/run_module_main.cc
@@ -21,6 +21,7 @@
 #include "iree/base/init.h"
 #include "iree/base/source_location.h"
 #include "iree/base/status.h"
+#include "iree/base/tracing.h"
 #include "iree/modules/hal/hal_module.h"
 #include "iree/tools/vm_util.h"
 #include "iree/vm/bytecode_module.h"
@@ -51,6 +52,7 @@
 namespace {
 
 StatusOr<std::string> GetModuleContentsFromFlags() {
+  IREE_TRACE_SCOPE0("GetModuleContentsFromFlags");
   auto input_file = absl::GetFlag(FLAGS_input_file);
   std::string contents;
   if (input_file == "-") {
@@ -63,6 +65,8 @@
 }
 
 Status Run() {
+  IREE_TRACE_SCOPE0("iree-run-module");
+
   RETURN_IF_ERROR(FromApiStatus(iree_hal_module_register_types(), IREE_LOC))
       << "registering HAL types";
   iree_vm_instance_t* instance = nullptr;
@@ -140,7 +144,7 @@
 }  // namespace
 
 extern "C" int main(int argc, char** argv) {
-  InitializeEnvironment(&argc, &argv);
+  iree::InitializeEnvironment(&argc, &argv);
   CHECK_OK(Run());
   return 0;
 }
diff --git a/iree/vm/invocation.c b/iree/vm/invocation.c
index 31bba7e..ad2df7d 100644
--- a/iree/vm/invocation.c
+++ b/iree/vm/invocation.c
@@ -65,19 +65,12 @@
   return IREE_STATUS_OK;
 }
 
-// TODO(benvanik): implement this as an iree_vm_invocation_t sequence.
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_vm_invoke(
     iree_vm_context_t* context, iree_vm_function_t function,
     const iree_vm_invocation_policy_t* policy, iree_vm_variant_list_t* inputs,
     iree_vm_variant_list_t* outputs, iree_allocator_t allocator) {
-  // NOTE: it is ok to have no inputs or outputs. If we do have them, though,
-  // they must be valid.
-  // TODO(benvanik): validate outputs capacity.
-  IREE_RETURN_IF_ERROR(iree_vm_validate_function_inputs(function, inputs));
-
   // Allocate a stack on the heap and initialize it.
-  // If we shrunk the stack (or made it so that it could dynamically grow)
-  // then we could stack-allocate it here and not need the allocator at all.
+  // TODO(#1172): allocate the VM stack on the native stack once it is smaller.
   iree_vm_stack_t* stack = NULL;
   IREE_RETURN_IF_ERROR(iree_allocator_malloc(allocator, sizeof(iree_vm_stack_t),
                                              (void**)&stack));
@@ -88,29 +81,45 @@
     return status;
   }
 
+  status =
+      iree_vm_invoke_within(context, stack, function, policy, inputs, outputs);
+
+  iree_vm_stack_deinit(stack);
+  iree_allocator_free(allocator, stack);
+  return status;
+}
+
+// TODO(benvanik): implement this as an iree_vm_invocation_t sequence.
+IREE_API_EXPORT iree_status_t IREE_API_CALL iree_vm_invoke_within(
+    iree_vm_context_t* context, iree_vm_stack_t* stack,
+    iree_vm_function_t function, const iree_vm_invocation_policy_t* policy,
+    iree_vm_variant_list_t* inputs, iree_vm_variant_list_t* outputs) {
+  // NOTE: |inputs| and |outputs| may each be NULL (both are checked before
+  // marshaling below); when provided they must be valid.
+  // TODO(benvanik): validate outputs capacity.
+  IREE_RETURN_IF_ERROR(iree_vm_validate_function_inputs(function, inputs));
+
   iree_vm_stack_frame_t* callee_frame = NULL;
-  status = iree_vm_stack_function_enter(stack, function, &callee_frame);
+  IREE_RETURN_IF_ERROR(
+      iree_vm_stack_function_enter(stack, function, &callee_frame));
 
   // Marshal inputs.
-  if (iree_status_is_ok(status) && inputs) {
-    status = iree_vm_marshal_inputs(inputs, callee_frame);
+  if (inputs) {
+    IREE_RETURN_IF_ERROR(iree_vm_marshal_inputs(inputs, callee_frame));
   }
 
   // Perform execution. Note that for synchronous execution we expect this to
   // complete without yielding.
-  if (iree_status_is_ok(status)) {
-    iree_vm_execution_result_t result;
-    status = function.module->execute(function.module->self, stack,
-                                      callee_frame, &result);
-  }
+  iree_vm_execution_result_t result;
+  IREE_RETURN_IF_ERROR(function.module->execute(function.module->self, stack,
+                                                callee_frame, &result));
 
   // Marshal outputs.
-  if (iree_status_is_ok(status) && outputs) {
-    status = iree_vm_marshal_outputs(callee_frame, outputs);
+  if (outputs) {
+    IREE_RETURN_IF_ERROR(iree_vm_marshal_outputs(callee_frame, outputs));
   }
 
-  iree_vm_stack_function_leave(stack);
-  iree_vm_stack_deinit(stack);
-  iree_allocator_free(allocator, stack);
-  return status;
+  IREE_RETURN_IF_ERROR(iree_vm_stack_function_leave(stack));
+
+  return IREE_STATUS_OK;
 }
diff --git a/iree/vm/invocation.h b/iree/vm/invocation.h
index fc80156..b7a7412 100644
--- a/iree/vm/invocation.h
+++ b/iree/vm/invocation.h
@@ -49,6 +49,13 @@
     const iree_vm_invocation_policy_t* policy, iree_vm_variant_list_t* inputs,
     iree_vm_variant_list_t* outputs, iree_allocator_t allocator);
 
+// Equivalent to iree_vm_invoke but uses an existing |stack|.
+// If the invocation fails the stack may be left in an indeterminate state.
+IREE_API_EXPORT iree_status_t IREE_API_CALL iree_vm_invoke_within(
+    iree_vm_context_t* context, iree_vm_stack_t* stack,
+    iree_vm_function_t function, const iree_vm_invocation_policy_t* policy,
+    iree_vm_variant_list_t* inputs, iree_vm_variant_list_t* outputs);
+
 // TODO(benvanik): document and implement.
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_vm_invocation_create(
     iree_vm_context_t* context, iree_vm_function_t function,
diff --git a/scripts/git/populate_reference_repo.sh b/scripts/git/populate_reference_repo.sh
index 0d1d871..895baa9 100755
--- a/scripts/git/populate_reference_repo.sh
+++ b/scripts/git/populate_reference_repo.sh
@@ -78,7 +78,6 @@
 populate_repo vk https://github.com/KhronosGroup/Vulkan-Headers.git
 populate_repo vkmem https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.git
 populate_repo gemmlowp https://github.com/google/gemmlowp.git
-populate_repo google_tracing_framework https://github.com/google/tracing-framework.git
 populate_repo spirv_tools https://github.com/KhronosGroup/SPIRV-Tools.git
 populate_repo spirv_headers https://github.com/KhronosGroup/SPIRV-Headers.git
 
diff --git a/third_party/google_tracing_framework b/third_party/google_tracing_framework
deleted file mode 160000
index 89ca6c2..0000000
--- a/third_party/google_tracing_framework
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 89ca6c25bae7c11d12409739b2ef707ed9afe6c2
diff --git a/third_party/tracy b/third_party/tracy
new file mode 160000
index 0000000..864d86e
--- /dev/null
+++ b/third_party/tracy
@@ -0,0 +1 @@
+Subproject commit 864d86e8b6d21449474db5e9313dbff90aa9c24f