Tracing improvements for the VM, executables, and iree-benchmark-module. (#3560)

diff --git a/iree/base/api.h b/iree/base/api.h
index d733293..61df05c 100644
--- a/iree/base/api.h
+++ b/iree/base/api.h
@@ -540,6 +540,13 @@
   if (IREE_UNLIKELY(var)) {                                                  \
     return IREE_STATUS_IMPL_ANNOTATE_SWITCH_(var, __VA_ARGS__);              \
   }
+#define IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_(tail_expr, var, ...)  \
+  iree_status_t var = (IREE_STATUS_IMPL_IDENTITY_(                           \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_GET_EXPR_)(__VA_ARGS__))); \
+  if (IREE_UNLIKELY(var)) {                                                  \
+    (tail_expr);                                                             \
+    return IREE_STATUS_IMPL_ANNOTATE_SWITCH_(var, __VA_ARGS__);              \
+  }
 #define IREE_STATUS_IMPL_IGNORE_ERROR_(var, expr) \
   iree_status_t var = (expr);                     \
   if (IREE_UNLIKELY(var)) iree_status_ignore(var);
@@ -552,6 +559,14 @@
 #define IREE_STATUS_IMPL_RETURN_IF_API_ERROR_(var, expr, ...) \
   iree_status_t var = (expr);                                 \
   if (IREE_UNLIKELY(var)) return var;
+#undef IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_
+#define IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_(tail_expr, var, expr, \
+                                                       ...)                  \
+  iree_status_t var = (expr);                                                \
+  if (IREE_UNLIKELY(var)) {                                                  \
+    (tail_expr);                                                             \
+    return var;                                                              \
+  }
 #undef IREE_STATUS_IMPL_IGNORE_ERROR_
 #define IREE_STATUS_IMPL_IGNORE_ERROR_(var, expr) \
   iree_status_t var = (expr);                     \
@@ -593,6 +608,12 @@
       IREE_STATUS_IMPL_CONCAT_(__status_, __COUNTER__), \
       IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_IDENTITY_(__VA_ARGS__)))
 
+// IREE_RETURN_IF_ERROR with a custom expression to evaluate before returning.
+#define IREE_RETURN_AND_EVAL_IF_ERROR(tail_expr, ...)              \
+  IREE_STATUS_IMPL_RETURN_AND_EVAL_IF_API_ERROR_(                  \
+      tail_expr, IREE_STATUS_IMPL_CONCAT_(__status_, __COUNTER__), \
+      IREE_STATUS_IMPL_IDENTITY_(IREE_STATUS_IMPL_IDENTITY_(__VA_ARGS__)))
+
 // Ignores the status result of (expr) regardless of its value.
 //
 // Example:
diff --git a/iree/base/flatbuffer_util.cc b/iree/base/flatbuffer_util.cc
index 379be73..117aea2 100644
--- a/iree/base/flatbuffer_util.cc
+++ b/iree/base/flatbuffer_util.cc
@@ -66,8 +66,7 @@
                                       std::function<void()> deleter,
                                       size_t root_type_size,
                                       VerifierFn verifier_fn) {
-  IREE_TRACE_SCOPE("FlatBufferFileBase::FromBuffer:size", int)
-  (static_cast<int>(buffer_data.size()));
+  IREE_TRACE_SCOPE();
 
   // Sanity check buffer for the minimum size as FlatBuffers doesn't.
   if (buffer_data.size() < 16) {
diff --git a/iree/base/tracing.h b/iree/base/tracing.h
index c8594de..a3226ad 100644
--- a/iree/base/tracing.h
+++ b/iree/base/tracing.h
@@ -263,12 +263,12 @@
 
 // Begins a new zone with the given runtime dynamic string name.
 // The |value| string will be copied into the trace buffer.
-#define IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(zone_id, name, name_length)   \
-  static const iree_tracing_location_t TracyConcat(                       \
-      __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \
-                                            (uint32_t)__LINE__, 0};       \
-  iree_zone_id_t zone_id = iree_tracing_zone_begin_impl(                  \
-      &TracyConcat(__tracy_source_location, __LINE__), name, name_length);
+#define IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(zone_id, name, name_length) \
+  static const iree_tracing_location_t TracyConcat(                     \
+      __tracy_source_location, __LINE__) = {0, __FUNCTION__, __FILE__,  \
+                                            (uint32_t)__LINE__, 0};     \
+  iree_zone_id_t zone_id = iree_tracing_zone_begin_impl(                \
+      &TracyConcat(__tracy_source_location, __LINE__), (name), (name_length));
 
 // Begins an externally defined zone with a dynamic source location.
 // The |file_name|, |function_name|, and optional |name| strings will be copied
@@ -280,6 +280,10 @@
       file_name, file_name_length, line, function_name, function_name_length, \
       name, name_length)
 
+// Appends an integer value to the parent zone. May be called multiple times.
+#define IREE_TRACE_ZONE_APPEND_VALUE(zone_id, value) \
+  ___tracy_emit_zone_value((struct ___tracy_c_zone_context){zone_id, 1}, value)
+
 // Appends a string value to the parent zone. May be called multiple times.
 // The |value| string will be copied into the trace buffer.
 #define IREE_TRACE_ZONE_APPEND_TEXT(...)                                  \
@@ -297,6 +301,11 @@
 #define IREE_TRACE_ZONE_END(zone_id) \
   ___tracy_emit_zone_end((struct ___tracy_c_zone_context){zone_id, 1})
 
+// Ends the current zone before returning on a failure.
+// Sugar for IREE_TRACE_ZONE_END+IREE_RETURN_IF_ERROR.
+#define IREE_RETURN_AND_END_ZONE_IF_ERROR(zone_id, ...) \
+  IREE_RETURN_AND_EVAL_IF_ERROR(IREE_TRACE_ZONE_END(zone_id), __VA_ARGS__)
+
 // Configures the named plot with an IREE_TRACING_PLOT_TYPE_* representation.
 #define IREE_TRACE_SET_PLOT_TYPE(name_literal, plot_type) \
   iree_tracing_set_plot_type_impl(name_literal, plot_type)
@@ -347,8 +356,11 @@
 #define IREE_TRACE_ZONE_BEGIN_EXTERNAL(                        \
     zone_id, file_name, file_name_length, line, function_name, \
     function_name_length, name, name_length)
+#define IREE_TRACE_ZONE_APPEND_VALUE(zone_id, value)
 #define IREE_TRACE_ZONE_APPEND_TEXT(zone_id, value, value_length)
 #define IREE_TRACE_ZONE_END(zone_id)
+#define IREE_RETURN_AND_END_ZONE_IF_ERROR(zone_id, ...) \
+  IREE_RETURN_IF_ERROR(__VA_ARGS__)
 #define IREE_TRACE_SET_PLOT_TYPE(name_literal, plot_type)
 #define IREE_TRACE_PLOT_VALUE_I64(name_literal, value)
 #define IREE_TRACE_PLOT_VALUE_F32(name_literal, value)
@@ -411,17 +423,20 @@
 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
 
 // TODO(#1886): update these to tracy and drop the 0.
-#define IREE_TRACE_SCOPE0(name_spec) ZoneScopedN(name_spec)
-#define IREE_TRACE_SCOPE(name_spec, ...)
-#define IREE_TRACE_EVENT0
+#define IREE_TRACE_SCOPE() ZoneScoped
+#define IREE_TRACE_SCOPE_DYNAMIC(name_cstr) \
+  ZoneTransientN(___tracy_scoped_zone, name_cstr, true)
+#define IREE_TRACE_SCOPE0(name_literal) ZoneScopedN(name_literal)
 #define IREE_TRACE_EVENT
+#define IREE_TRACE_EVENT0
 
 #else
 #define IREE_TRACE_THREAD_ENABLE(name)
-#define IREE_TRACE_SCOPE0(name_spec)
-#define IREE_TRACE_SCOPE(name_spec, ...) (void)
-#define IREE_TRACE_EVENT0
+#define IREE_TRACE_SCOPE()
+#define IREE_TRACE_SCOPE_DYNAMIC(name_cstr)
+#define IREE_TRACE_SCOPE0(name_literal)
 #define IREE_TRACE_EVENT(void)
+#define IREE_TRACE_EVENT0
 #endif  // IREE_TRACING_FEATURE_INSTRUMENTATION
 
 // TODO(benvanik): macros for LockableCtx / Lockable mutex tracking.
diff --git a/iree/hal/device_manager.cc b/iree/hal/device_manager.cc
index d3c3301..5a527f3 100644
--- a/iree/hal/device_manager.cc
+++ b/iree/hal/device_manager.cc
@@ -108,8 +108,7 @@
     MemoryTypeBitfield memory_type, BufferUsageBitfield buffer_usage,
     device_size_t allocation_size,
     absl::Span<const DevicePlacement> device_placements) {
-  IREE_TRACE_SCOPE("DeviceManager::TryAllocateDeviceVisibleBuffer:size", int)
-  (static_cast<int>(allocation_size));
+  IREE_TRACE_SCOPE0("DeviceManager::TryAllocateDeviceVisibleBuffer:size");
   if (!AnyBitSet(memory_type & MemoryType::kHostLocal)) {
     return InvalidArgumentErrorBuilder(IREE_LOC)
            << "Host-local buffers require the kHostLocal bit: "
@@ -138,8 +137,7 @@
     MemoryTypeBitfield memory_type, BufferUsageBitfield buffer_usage,
     device_size_t allocation_size,
     absl::Span<const DevicePlacement> device_placements) {
-  IREE_TRACE_SCOPE("DeviceManager::AllocateDeviceVisibleBuffer:size", int)
-  (static_cast<int>(allocation_size));
+  IREE_TRACE_SCOPE0("DeviceManager::AllocateDeviceVisibleBuffer:size");
   if (!AnyBitSet(memory_type & MemoryType::kHostLocal)) {
     return InvalidArgumentErrorBuilder(IREE_LOC)
            << "Host-local buffers require the kHostLocal bit: "
@@ -160,8 +158,7 @@
     MemoryTypeBitfield memory_type, BufferUsageBitfield buffer_usage,
     device_size_t allocation_size,
     absl::Span<const DevicePlacement> device_placements) {
-  IREE_TRACE_SCOPE("DeviceManager::AllocateDeviceLocalBuffer:size", int)
-  (static_cast<int>(allocation_size));
+  IREE_TRACE_SCOPE0("DeviceManager::AllocateDeviceLocalBuffer:size");
   if (!AnyBitSet(memory_type & MemoryType::kDeviceLocal)) {
     return InvalidArgumentErrorBuilder(IREE_LOC)
            << "Device-local buffers require the kDeviceLocal bit: "
diff --git a/iree/hal/dylib/dylib_executable.cc b/iree/hal/dylib/dylib_executable.cc
index def1786..10dbf29 100644
--- a/iree/hal/dylib/dylib_executable.cc
+++ b/iree/hal/dylib/dylib_executable.cc
@@ -16,7 +16,6 @@
 
 #include "flatbuffers/flatbuffers.h"
 #include "iree/base/file_io.h"
-#include "iree/base/tracing.h"
 #include "iree/schemas/dylib_executable_def_generated.h"
 
 namespace iree {
@@ -89,6 +88,10 @@
              << "Could not find symbol: " << entry_points[i];
     }
     entry_functions_[i] = symbol;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+    entry_names_[i] = entry_points[i]->c_str();
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
   }
 
   return OkStatus();
@@ -96,6 +99,11 @@
 
 struct DyLibDispatchState : public HostExecutable::DispatchState {
   DyLibDispatchState() = default;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  const char* entry_name = nullptr;
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
   void* entry_function = nullptr;
   absl::InlinedVector<void*, 4> args;
   absl::InlinedVector<int32_t, 4> push_constant;
@@ -111,6 +119,9 @@
   }
 
   auto dispatch_state = make_ref<DyLibDispatchState>();
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  dispatch_state->entry_name = entry_names_[params.entry_point];
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
   dispatch_state->entry_function = entry_functions_[params.entry_point];
 
   for (size_t set = 0; set < params.set_bindings.size(); ++set) {
@@ -135,8 +146,8 @@
 
 Status DyLibExecutable::DispatchTile(DispatchState* state,
                                      std::array<uint32_t, 3> workgroup_xyz) {
-  IREE_TRACE_SCOPE0("DyLibExecutable::DispatchTile");
   auto* dispatch_state = static_cast<DyLibDispatchState*>(state);
+  IREE_TRACE_SCOPE_DYNAMIC(dispatch_state->entry_name);
 
   auto entry_function =
       (void (*)(void**, int32_t*))dispatch_state->entry_function;
diff --git a/iree/hal/dylib/dylib_executable.h b/iree/hal/dylib/dylib_executable.h
index 63c4a26..210be15 100644
--- a/iree/hal/dylib/dylib_executable.h
+++ b/iree/hal/dylib/dylib_executable.h
@@ -21,6 +21,7 @@
 #include "absl/container/inlined_vector.h"
 #include "iree/base/dynamic_library.h"
 #include "iree/base/status.h"
+#include "iree/base/tracing.h"
 #include "iree/hal/executable_spec.h"
 #include "iree/hal/host/host_executable.h"
 
@@ -50,6 +51,10 @@
   std::string executable_library_temp_path_;
   std::unique_ptr<DynamicLibrary> executable_library_;
   absl::InlinedVector<void*, 4> entry_functions_;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  absl::InlinedVector<const char*, 4> entry_names_;
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
 };
 
 }  // namespace dylib
diff --git a/iree/hal/host/BUILD b/iree/hal/host/BUILD
index c7b28aa..a26fa0f 100644
--- a/iree/hal/host/BUILD
+++ b/iree/hal/host/BUILD
@@ -55,6 +55,7 @@
     deps = [
         "//iree/base:logging",
         "//iree/base:status",
+        "//iree/base:tracing",
         "//iree/hal:buffer",
     ],
 )
diff --git a/iree/hal/host/CMakeLists.txt b/iree/hal/host/CMakeLists.txt
index 803d7b6..953a777 100644
--- a/iree/hal/host/CMakeLists.txt
+++ b/iree/hal/host/CMakeLists.txt
@@ -55,6 +55,7 @@
   DEPS
     iree::base::logging
     iree::base::status
+    iree::base::tracing
     iree::hal::buffer
   PUBLIC
 )
diff --git a/iree/hal/host/host_buffer.cc b/iree/hal/host/host_buffer.cc
index c5016f3..265e048 100644
--- a/iree/hal/host/host_buffer.cc
+++ b/iree/hal/host/host_buffer.cc
@@ -20,6 +20,7 @@
 
 #include "iree/base/logging.h"
 #include "iree/base/status.h"
+#include "iree/base/tracing.h"
 
 namespace iree {
 namespace hal {
@@ -36,6 +37,7 @@
       owns_data_(owns_data) {}
 
 HostBuffer::~HostBuffer() {
+  IREE_TRACE_SCOPE();
   if (owns_data_ && data_) {
     std::free(data_);
     data_ = nullptr;
diff --git a/iree/hal/vmla/vmla_executable.cc b/iree/hal/vmla/vmla_executable.cc
index 336c610..4cc447a 100644
--- a/iree/hal/vmla/vmla_executable.cc
+++ b/iree/hal/vmla/vmla_executable.cc
@@ -156,8 +156,9 @@
 
 Status VMLAExecutable::DispatchTile(DispatchState* state,
                                     std::array<uint32_t, 3> workgroup_xyz) {
-  IREE_TRACE_SCOPE0("VMLAExecutable::DispatchTile");
   auto* dispatch_state = static_cast<VMLADispatchState*>(state);
+  IREE_TRACE_SCOPE_DYNAMIC(
+      iree_vm_function_name(&dispatch_state->function).data);
 
   auto* input_list_storage = alloca(dispatch_state->input_list_size);
   iree_vm_list_t* input_list = nullptr;
diff --git a/iree/modules/hal/hal_module.cc b/iree/modules/hal/hal_module.cc
index c990cf0..be8a730 100644
--- a/iree/modules/hal/hal_module.cc
+++ b/iree/modules/hal/hal_module.cc
@@ -156,10 +156,13 @@
     IREE_RETURN_IF_ERROR(iree_hal_semaphore_wait_with_deadline(
         semaphore.get(), 1ull, IREE_TIME_INFINITE_FUTURE));
 
-    for (auto& ref : deferred_releases_) {
-      iree_vm_ref_release(&ref);
+    {
+      IREE_TRACE_SCOPE0("HALModuleState::DeferredReleases");
+      for (auto& ref : deferred_releases_) {
+        iree_vm_ref_release(&ref);
+      }
+      deferred_releases_.clear();
     }
-    deferred_releases_.clear();
 
     return OkStatus();
   }
diff --git a/iree/tools/iree-benchmark-module-main.cc b/iree/tools/iree-benchmark-module-main.cc
index 96dc0c9..af0ab9f 100644
--- a/iree/tools/iree-benchmark-module-main.cc
+++ b/iree/tools/iree-benchmark-module-main.cc
@@ -59,38 +59,39 @@
 namespace iree {
 namespace {
 
+static void BenchmarkFunction(
+    const std::string& benchmark_name, iree_vm_context_t* context,
+    iree_vm_function_t function, iree_vm_list_t* inputs,
+    const std::vector<RawSignatureParser::Description>& output_descs,
+    benchmark::State& state) {
+  IREE_TRACE_SCOPE_DYNAMIC(benchmark_name.c_str());
+  IREE_TRACE_FRAME_MARK();
+
+  // Benchmarking loop: each iteration creates a fresh output list and invokes.
+  for (auto _ : state) {
+    IREE_TRACE_SCOPE0("BenchmarkIteration");
+    IREE_TRACE_FRAME_MARK_NAMED("Iteration");
+    vm::ref<iree_vm_list_t> outputs;
+    IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr,
+                                      output_descs.size(),
+                                      iree_allocator_system(), &outputs));
+    IREE_CHECK_OK(iree_vm_invoke(context, function, /*policy=*/nullptr, inputs,
+                                 outputs.get(), iree_allocator_system()));
+  }
+}
+
 void RegisterModuleBenchmarks(
     const std::string& function_name, iree_vm_context_t* context,
     iree_vm_function_t function, iree_vm_list_t* inputs,
     const std::vector<RawSignatureParser::Description>& output_descs) {
   auto benchmark_name = "BM_" + function_name;
-  benchmark::RegisterBenchmark(
-      benchmark_name.c_str(),
-      [context, function, inputs,
-       output_descs](benchmark::State& state) -> void {
-        // Warmup run step.
-        {
-          vm::ref<iree_vm_list_t> outputs;
-          IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr,
-                                            output_descs.size(),
-                                            iree_allocator_system(), &outputs));
-          IREE_CHECK_OK(iree_vm_invoke(context, function, /*policy=*/nullptr,
-                                       inputs, outputs.get(),
-                                       iree_allocator_system()));
-        }
-        // Benchmarking loop.
-        for (auto _ : state) {
-          // No status conversions and conditional returns in the benchmarked
-          // inner loop.
-          vm::ref<iree_vm_list_t> outputs;
-          IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr,
-                                            output_descs.size(),
-                                            iree_allocator_system(), &outputs));
-          IREE_CHECK_OK(iree_vm_invoke(context, function, /*policy=*/nullptr,
-                                       inputs, outputs.get(),
-                                       iree_allocator_system()));
-        }
-      })
+  benchmark::RegisterBenchmark(benchmark_name.c_str(),
+                               [benchmark_name, context, function, inputs,
+                                output_descs](benchmark::State& state) -> void {
+                                 BenchmarkFunction(benchmark_name, context,
+                                                   function, inputs,
+                                                   output_descs, state);
+                               })
       // By default only the main thread is included in CPU time. Include all
       // the threads instead.
       ->MeasureProcessCPUTime()
@@ -107,6 +108,7 @@
 }
 
 Status GetModuleContentsFromFlags(std::string& module_data) {
+  IREE_TRACE_SCOPE0("GetModuleContentsFromFlags");
   auto module_file = absl::GetFlag(FLAGS_module_file);
   IREE_ASSIGN_OR_RETURN(module_data, file_io::GetFileContents(module_file));
   return iree::OkStatus();
@@ -127,6 +129,8 @@
         context_(nullptr),
         input_module_(nullptr){};
   ~IREEBenchmark() {
+    IREE_TRACE_SCOPE0("IREEBenchmark::dtor");
+
     // Order matters.
     inputs_.reset();
     iree_vm_module_release(hal_module_);
@@ -137,6 +141,8 @@
   };
 
   Status Register() {
+    IREE_TRACE_SCOPE0("IREEBenchmark::Register");
+
     if (!instance_ || !device_ || !hal_module_ || !context_ || !input_module_) {
       IREE_RETURN_IF_ERROR(Init());
     }
@@ -152,6 +158,9 @@
 
  private:
   Status Init() {
+    IREE_TRACE_SCOPE0("IREEBenchmark::Init");
+    IREE_TRACE_FRAME_MARK_BEGIN_NAMED("init");
+
     IREE_RETURN_IF_ERROR(GetModuleContentsFromFlags(module_data_));
 
     IREE_RETURN_IF_ERROR(iree_hal_module_register_types());
@@ -170,10 +179,14 @@
     IREE_RETURN_IF_ERROR(iree_vm_context_create_with_modules(
         instance_, modules.data(), modules.size(), iree_allocator_system(),
         &context_));
+
+    IREE_TRACE_FRAME_MARK_END_NAMED("init");
     return iree::OkStatus();
   }
 
   Status RegisterSpecificFunction(const std::string& function_name) {
+    IREE_TRACE_SCOPE0("IREEBenchmark::RegisterSpecificFunction");
+
     iree_vm_function_t function;
     IREE_RETURN_IF_ERROR(input_module_->lookup_function(
         input_module_->self, IREE_VM_FUNCTION_LINKAGE_EXPORT,
@@ -203,6 +216,7 @@
   }
 
   Status RegisterAllExportedFunctions() {
+    IREE_TRACE_SCOPE0("IREEBenchmark::RegisterAllExportedFunctions");
     iree_vm_function_t function;
     iree_vm_module_signature_t signature =
         input_module_->signature(input_module_->self);
@@ -239,6 +253,8 @@
 }  // namespace iree
 
 int main(int argc, char** argv) {
+  IREE_TRACE_SCOPE0("main");
+
   // We have to contend with two flag parsing libraries here: absl's and
   // benchmark's. To make matters worse, both define the `--help` flag. To
   // ensure that each is able to parse its own flags, we use an absl "internal"
diff --git a/iree/vm/BUILD b/iree/vm/BUILD
index 07f1324..3f3d93e 100644
--- a/iree/vm/BUILD
+++ b/iree/vm/BUILD
@@ -170,6 +170,7 @@
         ":builtin_types",
         "//iree/base:api",
         "//iree/base:atomics",
+        "//iree/base:tracing",
     ],
 )
 
@@ -221,6 +222,7 @@
         "//iree/base:alignment",
         "//iree/base:api",
         "//iree/base:atomics",
+        "//iree/base:tracing",
     ],
 )
 
diff --git a/iree/vm/CMakeLists.txt b/iree/vm/CMakeLists.txt
index e17375e..48d6544 100644
--- a/iree/vm/CMakeLists.txt
+++ b/iree/vm/CMakeLists.txt
@@ -190,6 +190,7 @@
     ::builtin_types
     iree::base::api
     iree::base::atomics
+    iree::base::tracing
   PUBLIC
 )
 
@@ -251,6 +252,7 @@
     iree::base::alignment
     iree::base::api
     iree::base::atomics
+    iree::base::tracing
   PUBLIC
 )
 
diff --git a/iree/vm/bytecode_dispatch.c b/iree/vm/bytecode_dispatch.c
index 72ba1ab..bb5fbbf 100644
--- a/iree/vm/bytecode_dispatch.c
+++ b/iree/vm/bytecode_dispatch.c
@@ -14,6 +14,7 @@
 
 #include <string.h>
 
+#include "iree/base/tracing.h"
 #include "iree/vm/bytecode_dispatch_util.h"
 #include "iree/vm/list.h"
 
diff --git a/iree/vm/bytecode_module.c b/iree/vm/bytecode_module.c
index 8e564bd..caa89d3 100644
--- a/iree/vm/bytecode_module.c
+++ b/iree/vm/bytecode_module.c
@@ -16,6 +16,7 @@
 
 #include "iree/base/alignment.h"
 #include "iree/base/api.h"
+#include "iree/base/tracing.h"
 #include "iree/vm/bytecode_module_impl.h"
 #include "iree/vm/ref.h"
 #include "iree/vm/stack.h"
@@ -79,15 +80,18 @@
 // registered.
 static iree_status_t iree_vm_bytecode_module_resolve_types(
     iree_vm_TypeDef_vec_t type_defs, iree_vm_type_def_t* type_table) {
+  IREE_TRACE_ZONE_BEGIN(z0);
   for (size_t i = 0; i < iree_vm_TypeDef_vec_len(type_defs); ++i) {
     iree_vm_TypeDef_table_t type_def = iree_vm_TypeDef_vec_at(type_defs, i);
     type_table[i] = iree_vm_bytecode_module_resolve_type(type_def);
     if (!iree_vm_type_def_is_valid(type_table[i])) {
+      IREE_TRACE_ZONE_END(z0);
       return iree_make_status(IREE_STATUS_NOT_FOUND,
                               "no type registered with name '%s'",
                               iree_vm_TypeDef_full_name(type_def));
     }
   }
+  IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
 
@@ -249,6 +253,7 @@
 
 static void iree_vm_bytecode_module_destroy(void* self) {
   iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+  IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_allocator_free(module->flatbuffer_allocator,
                       (void*)module->flatbuffer_data.data);
@@ -256,6 +261,8 @@
   module->flatbuffer_allocator = iree_allocator_null();
 
   iree_allocator_free(module->allocator, module);
+
+  IREE_TRACE_ZONE_END(z0);
 }
 
 static iree_string_view_t iree_vm_bytecode_module_name(void* self) {
@@ -544,6 +551,7 @@
 static iree_status_t iree_vm_bytecode_module_alloc_state(
     void* self, iree_allocator_t allocator,
     iree_vm_module_state_t** out_module_state) {
+  IREE_TRACE_ZONE_BEGIN(z0);
   IREE_ASSERT_ARGUMENT(out_module_state);
   *out_module_state = NULL;
 
@@ -556,8 +564,9 @@
 
   // Allocate the storage for the structure and all its nested tables.
   iree_vm_bytecode_module_state_t* state = NULL;
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(allocator, total_state_struct_size,
-                                             (void**)&state));
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, total_state_struct_size,
+                                (void**)&state));
   state->allocator = allocator;
 
   // Perform layout to get the pointers into the storage for each nested table.
@@ -577,12 +586,14 @@
   }
 
   *out_module_state = (iree_vm_module_state_t*)state;
+  IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
 
 static void iree_vm_bytecode_module_free_state(
     void* self, iree_vm_module_state_t* module_state) {
   if (!module_state) return;
+  IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_vm_bytecode_module_state_t* state =
       (iree_vm_bytecode_module_state_t*)module_state;
@@ -593,6 +604,8 @@
   }
 
   iree_allocator_free(state->allocator, module_state);
+
+  IREE_TRACE_ZONE_END(z0);
 }
 
 static iree_status_t iree_vm_bytecode_module_resolve_import(
@@ -645,7 +658,7 @@
   // NOTE: any work here adds directly to the invocation time. Avoid doing too
   // much work or touching too many unlikely-to-be-cached structures (such as
   // walking the FlatBuffer, which may cause page faults).
-
+  IREE_TRACE_ZONE_BEGIN(z0);
   IREE_ASSERT_ARGUMENT(out_result);
   memset(out_result, 0, sizeof(iree_vm_execution_result_t));
 
@@ -653,12 +666,15 @@
   // allow exports here as well to make things easier to call externally.
   iree_vm_function_t function = call->function;
   if (function.linkage != IREE_VM_FUNCTION_LINKAGE_INTERNAL) {
-    IREE_RETURN_IF_ERROR(iree_vm_bytecode_module_get_function(
-        self, function.linkage, function.ordinal, &function, NULL, NULL));
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_vm_bytecode_module_get_function(
+            self, function.linkage, function.ordinal, &function, NULL, NULL));
   }
 
   iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
   if (function.ordinal >= module->function_descriptor_count) {
+    IREE_TRACE_ZONE_END(z0);
     return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "function ordinal out of range (0 < %u < %zu)",
                             function.ordinal,
@@ -688,28 +704,40 @@
       flatbuffers_string_len(calling_convention);
   iree_string_view_t cconv_arguments = iree_string_view_empty();
   iree_string_view_t cconv_results = iree_string_view_empty();
-  IREE_RETURN_IF_ERROR(iree_vm_function_call_get_cconv_fragments(
-      &signature, &cconv_arguments, &cconv_results));
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vm_function_call_get_cconv_fragments(
+              &signature, &cconv_arguments, &cconv_results));
 
   // Jump into the dispatch routine to execute bytecode until the function
   // either returns (synchronous) or yields (asynchronous).
-  return iree_vm_bytecode_dispatch(stack, module, call, cconv_arguments,
-                                   cconv_results, out_result);
+  iree_status_t status = iree_vm_bytecode_dispatch(
+      stack, module, call, cconv_arguments, cconv_results, out_result);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
 }
 
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_vm_bytecode_module_create(
     iree_const_byte_span_t flatbuffer_data,
     iree_allocator_t flatbuffer_allocator, iree_allocator_t allocator,
     iree_vm_module_t** out_module) {
+  IREE_TRACE_ZONE_BEGIN(z0);
   IREE_ASSERT_ARGUMENT(out_module);
   *out_module = NULL;
 
-  IREE_RETURN_IF_ERROR(
-      iree_vm_bytecode_module_flatbuffer_verify(flatbuffer_data));
+  IREE_TRACE_ZONE_BEGIN_NAMED(z1, "iree_vm_bytecode_module_flatbuffer_verify");
+  iree_status_t status =
+      iree_vm_bytecode_module_flatbuffer_verify(flatbuffer_data);
+  if (!iree_status_is_ok(status)) {
+    IREE_TRACE_ZONE_END(z1);
+    IREE_TRACE_ZONE_END(z0);
+    return status;
+  }
+  IREE_TRACE_ZONE_END(z1);
 
   iree_vm_BytecodeModuleDef_table_t module_def =
       iree_vm_BytecodeModuleDef_as_root(flatbuffer_data.data);
   if (!module_def) {
+    IREE_TRACE_ZONE_END(z0);
     return iree_make_status(
         IREE_STATUS_INVALID_ARGUMENT,
         "failed getting root from flatbuffer; expected identifier "
@@ -721,9 +749,10 @@
       iree_vm_TypeDef_vec_len(type_defs) * sizeof(iree_vm_type_def_t);
 
   iree_vm_bytecode_module_t* module = NULL;
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
-      allocator, sizeof(iree_vm_bytecode_module_t) + type_table_size,
-      (void**)&module));
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(
+              allocator, sizeof(iree_vm_bytecode_module_t) + type_table_size,
+              (void**)&module));
   module->allocator = allocator;
 
   iree_vm_FunctionDescriptor_vec_t function_descriptors =
@@ -748,6 +777,7 @@
       iree_vm_bytecode_module_resolve_types(type_defs, module->type_table);
   if (!iree_status_is_ok(resolve_status)) {
     iree_allocator_free(allocator, module);
+    IREE_TRACE_ZONE_END(z0);
     return resolve_status;
   }
 
@@ -765,5 +795,6 @@
       iree_vm_bytecode_module_get_function_reflection_attr;
 
   *out_module = &module->interface;
+  IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
diff --git a/iree/vm/context.c b/iree/vm/context.c
index e0d2b8b..e211adc 100644
--- a/iree/vm/context.c
+++ b/iree/vm/context.c
@@ -89,6 +89,8 @@
 static iree_status_t iree_vm_context_resolve_module_imports(
     iree_vm_context_t* context, iree_vm_module_t* module,
     iree_vm_module_state_t* module_state) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
   // NOTE: this has some bad characteristics, but the number of modules and the
   // number of imported functions should be relatively small (even if the number
   // of exported functions for particular modules is large).
@@ -96,7 +98,8 @@
   for (int i = 0; i < module_signature.import_function_count; ++i) {
     iree_string_view_t full_name;
     iree_vm_function_signature_t expected_signature;
-    IREE_RETURN_IF_ERROR(
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
         module->get_function(module->self, IREE_VM_FUNCTION_LINKAGE_IMPORT, i,
                              /*out_function=*/NULL,
                              /*out_name=*/&full_name,
@@ -105,7 +108,8 @@
     // Resolve the function to the module that contains it and return the
     // information.
     iree_vm_function_t import_function;
-    IREE_RETURN_IF_ERROR(
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
         iree_vm_context_resolve_function(context, full_name, &import_function));
 
     // Query the function signature from the module that contains it; we don't
@@ -127,6 +131,7 @@
     if (expected_signature.calling_convention.size &&
         !iree_string_view_equal(import_signature.calling_convention,
                                 expected_signature.calling_convention)) {
+      IREE_TRACE_ZONE_END(z0);
       return iree_make_status(
           IREE_STATUS_INTERNAL,
           "import function signature mismatch between %.*s "
@@ -141,9 +146,12 @@
           import_signature.calling_convention.data);
     }
 
-    IREE_RETURN_IF_ERROR(module->resolve_import(
-        module->self, module_state, i, &import_function, &import_signature));
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, module->resolve_import(module->self, module_state, i,
+                                   &import_function, &import_signature));
   }
+
+  IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
 
@@ -199,6 +207,7 @@
     iree_vm_instance_t* instance, iree_vm_module_t** modules,
     iree_host_size_t module_count, iree_allocator_t allocator,
     iree_vm_context_t** out_context) {
+  IREE_TRACE_ZONE_BEGIN(z0);
   IREE_ASSERT_ARGUMENT(instance);
   IREE_ASSERT_ARGUMENT(out_context);
   *out_context = NULL;
@@ -231,10 +240,12 @@
       iree_vm_context_register_modules(context, modules, module_count);
   if (!iree_status_is_ok(register_status)) {
     iree_vm_context_destroy(context);
+    IREE_TRACE_ZONE_END(z0);
     return register_status;
   }
 
   *out_context = context;
+  IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
 }
 
@@ -317,9 +328,12 @@
     }
   }
 
+  IREE_TRACE_ZONE_BEGIN(z0);
+
   // Try growing both our storage lists first, if needed.
   if (context->list.count + module_count > context->list.capacity) {
     if (context->is_static) {
+      IREE_TRACE_ZONE_END(z0);
       return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
                               "context was allocated as static and cannot "
                               "register modules after creation");
@@ -330,13 +344,16 @@
       new_capacity = context->list.capacity * 2;
     }
     iree_vm_module_t** new_module_list;
-    IREE_RETURN_IF_ERROR(iree_allocator_malloc(
-        context->allocator, sizeof(iree_vm_module_t*) * new_capacity,
-        (void**)&new_module_list));
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_allocator_malloc(context->allocator,
+                                  sizeof(iree_vm_module_t*) * new_capacity,
+                                  (void**)&new_module_list));
     iree_vm_module_state_t** new_module_state_list;
-    IREE_RETURN_IF_ERROR(iree_allocator_malloc(
-        context->allocator, sizeof(iree_vm_module_state_t*) * new_capacity,
-        (void**)&new_module_state_list));
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0,
+        iree_allocator_malloc(context->allocator,
+                              sizeof(iree_vm_module_state_t*) * new_capacity,
+                              (void**)&new_module_state_list));
     memcpy(new_module_list, context->list.modules,
            sizeof(iree_vm_module_t*) * context->list.count);
     memcpy(new_module_state_list, context->list.module_states,
@@ -409,12 +426,14 @@
     context->list.count = original_count;
   }
 
+  IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_vm_context_resolve_function(
     const iree_vm_context_t* context, iree_string_view_t full_name,
     iree_vm_function_t* out_function) {
+  IREE_TRACE_ZONE_BEGIN(z0);
   IREE_ASSERT_ARGUMENT(out_function);
   memset(out_function, 0, sizeof(iree_vm_function_t));
 
@@ -422,6 +441,7 @@
   iree_string_view_t function_name;
   if (iree_string_view_split(full_name, '.', &module_name, &function_name) ==
       -1) {
+    IREE_TRACE_ZONE_END(z0);
     return iree_make_status(
         IREE_STATUS_INVALID_ARGUMENT,
         "import name not fully-qualified (module.func): '%.*s'",
@@ -430,13 +450,15 @@
 
   for (int i = (int)context->list.count - 1; i >= 0; --i) {
     iree_vm_module_t* module = context->list.modules[i];
-    if (iree_string_view_compare(module_name, iree_vm_module_name(module)) ==
-        0) {
-      return iree_vm_module_lookup_function_by_name(
+    if (iree_string_view_equal(module_name, iree_vm_module_name(module))) {
+      iree_status_t status = iree_vm_module_lookup_function_by_name(
           module, IREE_VM_FUNCTION_LINKAGE_EXPORT, function_name, out_function);
+      IREE_TRACE_ZONE_END(z0);
+      return status;
     }
   }
 
+  IREE_TRACE_ZONE_END(z0);
   return iree_make_status(IREE_STATUS_NOT_FOUND,
                           "module '%.*s' required for import '%.*s' not "
                           "registered with the context",
diff --git a/iree/vm/instance.c b/iree/vm/instance.c
index 5cc92d8..1ca17b4 100644
--- a/iree/vm/instance.c
+++ b/iree/vm/instance.c
@@ -15,6 +15,7 @@
 #include "iree/vm/instance.h"
 
 #include "iree/base/atomics.h"
+#include "iree/base/tracing.h"
 #include "iree/vm/builtin_types.h"
 
 struct iree_vm_instance {
@@ -24,24 +25,29 @@
 
 IREE_API_EXPORT iree_status_t IREE_API_CALL iree_vm_instance_create(
     iree_allocator_t allocator, iree_vm_instance_t** out_instance) {
+  IREE_TRACE_ZONE_BEGIN(z0);  // Opens zone z0; it is closed before each return.
   IREE_ASSERT_ARGUMENT(out_instance);
   *out_instance = NULL;
 
-  IREE_RETURN_IF_ERROR(iree_vm_register_builtin_types());
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_vm_register_builtin_types());
 
   iree_vm_instance_t* instance = NULL;
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
-      allocator, sizeof(iree_vm_instance_t), (void**)&instance));
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(allocator, sizeof(iree_vm_instance_t),
+                                (void**)&instance));  // NOTE(review): presumably ends z0 then returns on failure - confirm.
   instance->allocator = allocator;
   iree_atomic_ref_count_init(&instance->ref_count);
 
   *out_instance = instance;
+  IREE_TRACE_ZONE_END(z0);  // Success path: close z0 before returning OK.
   return iree_ok_status();
 }
 
 static void iree_vm_instance_destroy(iree_vm_instance_t* instance) {
+  IREE_TRACE_ZONE_BEGIN(z0);  // Zone z0 covers the argument check and the free.
   IREE_ASSERT_ARGUMENT(instance);
   iree_allocator_free(instance->allocator, instance);
+  IREE_TRACE_ZONE_END(z0);  // Uses only the zone id, not |instance|.
 }
 
 IREE_API_EXPORT void IREE_API_CALL
diff --git a/iree/vm/module.c b/iree/vm/module.c
index 934896f..54e602f 100644
--- a/iree/vm/module.c
+++ b/iree/vm/module.c
@@ -17,6 +17,7 @@
 #include <string.h>
 
 #include "iree/base/atomics.h"
+#include "iree/base/tracing.h"
 #include "iree/vm/ref.h"
 
 IREE_API_EXPORT iree_status_t IREE_API_CALL
@@ -138,9 +139,11 @@
 
 IREE_API_EXPORT iree_status_t IREE_API_CALL
 iree_vm_module_initialize(iree_vm_module_t* module, void* self) {
+  IREE_TRACE_ZONE_BEGIN(z0);  // Zone z0 brackets the field setup below.
   memset(module, 0, sizeof(iree_vm_module_t));
   module->self = self;
   iree_atomic_ref_count_init(&module->ref_count);
+  IREE_TRACE_ZONE_END(z0);  // Close z0 ahead of the unconditional OK return.
   return iree_ok_status();
 }
 
@@ -225,8 +228,10 @@
 IREE_API_EXPORT iree_string_view_t IREE_API_CALL
 iree_vm_function_reflection_attr(const iree_vm_function_t* function,
                                  iree_string_view_t key) {
+  IREE_TRACE_ZONE_BEGIN(z0);
   iree_vm_module_t* module = function->module;
   if (!module->get_function_reflection_attr) {
+    IREE_TRACE_ZONE_END(z0);
     return iree_string_view_empty();
   }
   for (int index = 0;; ++index) {
@@ -239,9 +244,11 @@
       break;
     }
     if (iree_string_view_compare(key, index_key) == 0) {
+      IREE_TRACE_ZONE_END(z0);
       return index_value;
     }
   }
+  IREE_TRACE_ZONE_END(z0);
   return iree_string_view_empty();
 }
 
diff --git a/iree/vm/stack.c b/iree/vm/stack.c
index 6fe9e36..8acdd88 100644
--- a/iree/vm/stack.c
+++ b/iree/vm/stack.c
@@ -419,6 +419,15 @@
   stack->frame_storage_size = new_top;
   stack->top = frame_header;
 
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  // TODO(benvanik): cache source location and query from module.
+  iree_string_view_t function_name = iree_vm_function_name(function);
+  IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(z0, function_name.data,
+                                      function_name.size);
+  callee_frame->trace_zone = z0;
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, frame_size);
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
   if (out_callee_frame) *out_callee_frame = callee_frame;
   return iree_ok_status();
 }
@@ -435,6 +444,8 @@
     stack->top->frame_cleanup_fn(&stack->top->frame);
   }
 
+  IREE_TRACE_ZONE_END(stack->top->frame.trace_zone);
+
   // Restore the frame pointer to the caller.
   stack->frame_storage_size -= stack->top->frame_size;
   stack->top = stack->top->parent;
diff --git a/iree/vm/stack.h b/iree/vm/stack.h
index 3f1c580..98fe693 100644
--- a/iree/vm/stack.h
+++ b/iree/vm/stack.h
@@ -20,6 +20,7 @@
 
 #include "iree/base/alignment.h"
 #include "iree/base/api.h"
+#include "iree/base/tracing.h"
 #include "iree/vm/module.h"
 #include "iree/vm/ref.h"
 
@@ -85,6 +86,10 @@
   // offset (such as in the case of VM bytecode), a block identifier (compiled
   // code), etc.
   iree_vm_source_offset_t pc;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  iree_zone_id_t trace_zone;
+#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
 } iree_vm_stack_frame_t;
 
 // Returns the implementation-defined frame storage associated with |frame|.