Adding a HAL module debug sink interface. (#18966)

HAL module creators can now pass in a debug sink that provides callbacks
the HAL module invokes either on request from programs or on its own
behalf. Today it only has a buffer view trace callback, but in the
future we could use this to report profiling information or
asynchronous errors.

A default `iree_hal_module_debug_sink_stdio(file)` is provided that
preserves the current behavior of writing trace buffers to stderr (when
HAL string utils are enabled). `iree_hal_module_debug_sink_null()` can
be used to ensure no string handling code is compiled in if a caller
doesn't want it. Callers can also implement whatever behavior they want.
diff --git a/compiler/src/iree/compiler/ConstEval/Runtime.cpp b/compiler/src/iree/compiler/ConstEval/Runtime.cpp
index 92a64cd..38a9524 100644
--- a/compiler/src/iree/compiler/ConstEval/Runtime.cpp
+++ b/compiler/src/iree/compiler/ConstEval/Runtime.cpp
@@ -471,6 +471,7 @@
     std::array<iree_hal_device_t *, 1> devices = {device.get()};
     status = iree_hal_module_create(runtime.instance.get(), devices.size(),
                                     devices.data(), IREE_HAL_MODULE_FLAG_NONE,
+                                    iree_hal_module_debug_sink_stdio(stderr),
                                     iree_allocator_system(), &hal_module);
   }
 
diff --git a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
index bbadf5c..694cfa8 100644
--- a/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
+++ b/integrations/pjrt/src/iree_pjrt/common/api_impl.cc
@@ -1601,7 +1601,8 @@
   modules.push_back({});
   IREE_RETURN_IF_ERROR(iree_hal_module_create(
       vm_instance(), /*device_count=*/1, &hal_device, IREE_HAL_MODULE_FLAG_NONE,
-      host_allocator(), &modules.back()));
+      iree_hal_module_debug_sink_stdio(stderr), host_allocator(),
+      &modules.back()));
 
   // Main module.
   modules.push_back(main_module);
diff --git a/runtime/bindings/python/hal.cc b/runtime/bindings/python/hal.cc
index 61bd8a3..7a0e0cd 100644
--- a/runtime/bindings/python/hal.cc
+++ b/runtime/bindings/python/hal.cc
@@ -1097,10 +1097,12 @@
     devices_ptr = devices_vector.data();
     device_count = devices_vector.size();
   }
-  CheckApiStatus(iree_hal_module_create(instance->raw_ptr(), device_count,
-                                        devices_ptr, IREE_HAL_MODULE_FLAG_NONE,
-                                        iree_allocator_system(), &module),
-                 "Error creating hal module");
+  CheckApiStatus(
+      iree_hal_module_create(instance->raw_ptr(), device_count, devices_ptr,
+                             IREE_HAL_MODULE_FLAG_NONE,
+                             iree_hal_module_debug_sink_stdio(stderr),
+                             iree_allocator_system(), &module),
+      "Error creating hal module");
   return VmModule::StealFromRawPtr(module);
 }
 
diff --git a/runtime/bindings/tflite/interpreter.c b/runtime/bindings/tflite/interpreter.c
index 6f0120b..4a9b474 100644
--- a/runtime/bindings/tflite/interpreter.c
+++ b/runtime/bindings/tflite/interpreter.c
@@ -60,10 +60,10 @@
       "failed creating the default device for driver '%.*s'",
       (int)driver_name.size, driver_name.data);
 
-  IREE_RETURN_IF_ERROR(
-      iree_hal_module_create(interpreter->instance, /*device_count=*/1,
-                             &interpreter->device, IREE_HAL_MODULE_FLAG_NONE,
-                             interpreter->allocator, &interpreter->hal_module));
+  IREE_RETURN_IF_ERROR(iree_hal_module_create(
+      interpreter->instance, /*device_count=*/1, &interpreter->device,
+      IREE_HAL_MODULE_FLAG_NONE, iree_hal_module_debug_sink_stdio(stderr),
+      interpreter->allocator, &interpreter->hal_module));
 
   return iree_ok_status();
 }
diff --git a/runtime/src/iree/base/config.h b/runtime/src/iree/base/config.h
index ced46d9..1c87930 100644
--- a/runtime/src/iree/base/config.h
+++ b/runtime/src/iree/base/config.h
@@ -144,7 +144,7 @@
 // File I/O
 //===----------------------------------------------------------------------===//
 // On platforms without file systems or in applications where no file I/O
-// utilties are used, all file I/O operations can be stripped out. Functions
+// utilities are used, all file I/O operations can be stripped out. Functions
 // relying on file I/O will still be defined, but they will return errors.
 
 #if !defined(IREE_FILE_IO_ENABLE)
diff --git a/runtime/src/iree/modules/check/check_test.cc b/runtime/src/iree/modules/check/check_test.cc
index e6afaee..d5e1c7b 100644
--- a/runtime/src/iree/modules/check/check_test.cc
+++ b/runtime/src/iree/modules/check/check_test.cc
@@ -46,7 +46,8 @@
         hal_driver, iree_allocator_system(), &device_));
     IREE_ASSERT_OK(iree_hal_module_create(
         instance_, /*device_count=*/1, &device_, IREE_HAL_MODULE_FLAG_NONE,
-        iree_allocator_system(), &hal_module_));
+        iree_hal_module_debug_sink_stdio(stderr), iree_allocator_system(),
+        &hal_module_));
     iree_hal_driver_release(hal_driver);
 
     IREE_ASSERT_OK(iree_check_module_create(instance_, iree_allocator_system(),
diff --git a/runtime/src/iree/modules/hal/BUILD.bazel b/runtime/src/iree/modules/hal/BUILD.bazel
index c1c6c56..459e6ac 100644
--- a/runtime/src/iree/modules/hal/BUILD.bazel
+++ b/runtime/src/iree/modules/hal/BUILD.bazel
@@ -13,6 +13,17 @@
 )
 
 iree_runtime_cc_library(
+    name = "debugging",
+    srcs = ["debugging.c"],
+    hdrs = ["debugging.h"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/vm",
+    ],
+)
+
+iree_runtime_cc_library(
     name = "hal",
     srcs = [
         "module.c",
@@ -24,6 +35,7 @@
         "exports.inl",
     ],
     deps = [
+        ":debugging",
         ":types",
         "//runtime/src/iree/base",
         "//runtime/src/iree/hal",
diff --git a/runtime/src/iree/modules/hal/CMakeLists.txt b/runtime/src/iree/modules/hal/CMakeLists.txt
index 8e68384..6725ad6 100644
--- a/runtime/src/iree/modules/hal/CMakeLists.txt
+++ b/runtime/src/iree/modules/hal/CMakeLists.txt
@@ -12,6 +12,20 @@
 
 iree_cc_library(
   NAME
+    debugging
+  HDRS
+    "debugging.h"
+  SRCS
+    "debugging.c"
+  DEPS
+    iree::base
+    iree::hal
+    iree::vm
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     hal
   HDRS
     "module.h"
@@ -20,6 +34,7 @@
   SRCS
     "module.c"
   DEPS
+    ::debugging
     ::types
     iree::base
     iree::hal
diff --git a/runtime/src/iree/modules/hal/debugging.c b/runtime/src/iree/modules/hal/debugging.c
new file mode 100644
index 0000000..fbb9b60
--- /dev/null
+++ b/runtime/src/iree/modules/hal/debugging.c
@@ -0,0 +1,74 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/modules/hal/debugging.h"
+
+//===----------------------------------------------------------------------===//
+// Debug Sink
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT iree_hal_module_debug_sink_t
+iree_hal_module_debug_sink_null(void) {
+  iree_hal_module_debug_sink_t sink = {0};
+  return sink;
+}
+
+#if defined(IREE_FILE_IO_ENABLE)
+
+#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
+static iree_status_t iree_hal_module_buffer_view_trace_stdio(
+    void* user_data, iree_string_view_t key, iree_host_size_t buffer_view_count,
+    iree_hal_buffer_view_t** buffer_views, iree_allocator_t host_allocator) {
+  FILE* file = (FILE*)user_data;
+
+  fprintf(file, "=== %.*s ===\n", (int)key.size, key.data);
+  for (iree_host_size_t i = 0; i < buffer_view_count; ++i) {
+    iree_hal_buffer_view_t* buffer_view = buffer_views[i];
+
+    // NOTE: this export is for debugging only and a no-op in min-size builds.
+    // We heap-alloc here because at the point this export is used performance
+    // is not a concern.
+
+    // Query total length (excluding NUL terminator).
+    iree_host_size_t result_length = 0;
+    iree_status_t status = iree_hal_buffer_view_format(
+        buffer_view, IREE_HOST_SIZE_MAX, 0, NULL, &result_length);
+    if (!iree_status_is_out_of_range(status)) {
+      return status;
+    }
+    ++result_length;  // include NUL
+
+    // Allocate scratch heap memory to contain the result and format into it.
+    char* result_str = NULL;
+    IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, result_length,
+                                               (void**)&result_str));
+    status =
+        iree_hal_buffer_view_format(buffer_view, IREE_HOST_SIZE_MAX,
+                                    result_length, result_str, &result_length);
+    if (iree_status_is_ok(status)) {
+      fprintf(file, "%.*s\n", (int)result_length, result_str);
+    }
+    iree_allocator_free(host_allocator, result_str);
+    IREE_RETURN_IF_ERROR(status);
+  }
+  fprintf(file, "\n");
+  return iree_ok_status();
+}
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+
+IREE_API_EXPORT iree_hal_module_debug_sink_t
+iree_hal_module_debug_sink_stdio(FILE* file) {
+  iree_hal_module_debug_sink_t sink = {0};
+
+#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
+  sink.buffer_view_trace.fn = iree_hal_module_buffer_view_trace_stdio;
+  sink.buffer_view_trace.user_data = file;
+#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
+
+  return sink;
+}
+
+#endif  // IREE_FILE_IO_ENABLE
diff --git a/runtime/src/iree/modules/hal/debugging.h b/runtime/src/iree/modules/hal/debugging.h
new file mode 100644
index 0000000..7500f2c
--- /dev/null
+++ b/runtime/src/iree/modules/hal/debugging.h
@@ -0,0 +1,61 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_MODULES_HAL_DEBUGGING_H_
+#define IREE_MODULES_HAL_DEBUGGING_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+//===----------------------------------------------------------------------===//
+// Debug Sink
+//===----------------------------------------------------------------------===//
+
+// Receives the key and buffer views from a hal.buffer_view.trace call.
+typedef iree_status_t(
+    IREE_API_PTR* iree_hal_module_buffer_view_trace_callback_fn_t)(
+    void* user_data, iree_string_view_t key, iree_host_size_t buffer_view_count,
+    iree_hal_buffer_view_t** buffer_views, iree_allocator_t host_allocator);
+
+typedef struct iree_hal_module_buffer_view_trace_callback_t {
+  iree_hal_module_buffer_view_trace_callback_fn_t fn;
+  void* user_data;
+} iree_hal_module_buffer_view_trace_callback_t;
+
+// Interface for a HAL module debug event sink.
+// Any referenced user data must remain live for the lifetime of the HAL module
+// the sink is provided to.
+typedef struct iree_hal_module_debug_sink_t {
+  // Called on each hal.buffer_view.trace.
+  iree_hal_module_buffer_view_trace_callback_t buffer_view_trace;
+} iree_hal_module_debug_sink_t;
+
+// Returns a default debug sink that outputs nothing.
+IREE_API_EXPORT iree_hal_module_debug_sink_t
+iree_hal_module_debug_sink_null(void);
+
+#if defined(IREE_FILE_IO_ENABLE)
+
+// Returns a default debug sink that routes to an stdio stream in textual form.
+IREE_API_EXPORT iree_hal_module_debug_sink_t
+iree_hal_module_debug_sink_stdio(FILE* file);
+
+#else
+
+#define iree_hal_module_debug_sink_stdio(file) iree_hal_module_debug_sink_null()
+
+#endif  // IREE_FILE_IO_ENABLE
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_MODULES_HAL_DEBUGGING_H_
diff --git a/runtime/src/iree/modules/hal/inline/BUILD.bazel b/runtime/src/iree/modules/hal/inline/BUILD.bazel
index 9d80890..ee0e934 100644
--- a/runtime/src/iree/modules/hal/inline/BUILD.bazel
+++ b/runtime/src/iree/modules/hal/inline/BUILD.bazel
@@ -27,6 +27,7 @@
         "//runtime/src/iree/base",
         "//runtime/src/iree/base/internal:cpu",
         "//runtime/src/iree/hal",
+        "//runtime/src/iree/modules/hal:debugging",
         "//runtime/src/iree/modules/hal:types",
         "//runtime/src/iree/modules/hal/utils:buffer_diagnostics",
         "//runtime/src/iree/vm",
diff --git a/runtime/src/iree/modules/hal/inline/CMakeLists.txt b/runtime/src/iree/modules/hal/inline/CMakeLists.txt
index 78b00aa..24dbc06 100644
--- a/runtime/src/iree/modules/hal/inline/CMakeLists.txt
+++ b/runtime/src/iree/modules/hal/inline/CMakeLists.txt
@@ -23,6 +23,7 @@
     iree::base
     iree::base::internal::cpu
     iree::hal
+    iree::modules::hal::debugging
     iree::modules::hal::types
     iree::modules::hal::utils::buffer_diagnostics
     iree::vm
diff --git a/runtime/src/iree/modules/hal/inline/module.c b/runtime/src/iree/modules/hal/inline/module.c
index cd1dfdb..c3c5dea 100644
--- a/runtime/src/iree/modules/hal/inline/module.c
+++ b/runtime/src/iree/modules/hal/inline/module.c
@@ -136,6 +136,7 @@
   iree_allocator_t host_allocator;
   iree_hal_allocator_t* device_allocator;
   iree_hal_inline_module_flags_t flags;
+  iree_hal_module_debug_sink_t debug_sink;
   // TODO(benvanik): types.
 } iree_hal_inline_module_t;
 
@@ -147,6 +148,7 @@
   iree_allocator_t host_allocator;
   iree_hal_allocator_t* device_allocator;
   iree_hal_inline_module_flags_t flags;
+  iree_hal_module_debug_sink_t debug_sink;
 } iree_hal_inline_module_state_t;
 
 static void IREE_API_PTR iree_hal_inline_module_destroy(void* base_module) {
@@ -170,6 +172,7 @@
   state->device_allocator = module->device_allocator;
   iree_hal_allocator_retain(state->device_allocator);
   state->flags = module->flags;
+  state->debug_sink = module->debug_sink;
 
   *out_module_state = (iree_vm_module_state_t*)state;
   IREE_TRACE_ZONE_END(z0);
@@ -507,8 +510,26 @@
 IREE_VM_ABI_EXPORT(iree_hal_inline_module_buffer_view_trace,  //
                    iree_hal_inline_module_state_t,            //
                    rCrD, v) {
-  return iree_hal_modules_buffer_view_trace(args->r0, args->a1_count, args->a1,
-                                            state->host_allocator);
+  if (state->debug_sink.buffer_view_trace.fn) {
+    iree_vm_buffer_t* key = NULL;
+    IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r0, &key));
+    iree_string_view_t key_str = iree_vm_buffer_as_string(key);
+    iree_host_size_t buffer_view_count = (iree_host_size_t)args->a1_count;
+    if (buffer_view_count > 128) {
+      return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                              "too many buffer views for a single trace call");
+    }
+    iree_hal_buffer_view_t** buffer_views =
+        iree_alloca(buffer_view_count * sizeof(iree_hal_buffer_view_t*));
+    for (iree_host_size_t i = 0; i < buffer_view_count; ++i) {
+      IREE_RETURN_IF_ERROR(
+          iree_hal_buffer_view_check_deref(args->a1[i].r0, &buffer_views[i]));
+    }
+    return state->debug_sink.buffer_view_trace.fn(
+        state->debug_sink.buffer_view_trace.user_data, key_str,
+        buffer_view_count, buffer_views, state->host_allocator);
+  }
+  return iree_ok_status();
 }
 
 //===----------------------------------------------------------------------===//
@@ -598,6 +619,7 @@
 
 IREE_API_EXPORT iree_status_t iree_hal_inline_module_create(
     iree_vm_instance_t* instance, iree_hal_inline_module_flags_t flags,
+    iree_hal_module_debug_sink_t debug_sink,
     iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
     iree_vm_module_t** out_module) {
   IREE_ASSERT_ARGUMENT(instance);
@@ -635,6 +657,7 @@
   module->device_allocator = device_allocator;
   iree_hal_allocator_retain(module->device_allocator);
   module->flags = flags;
+  module->debug_sink = debug_sink;
 
   *out_module = base_module;
   return iree_ok_status();
diff --git a/runtime/src/iree/modules/hal/inline/module.h b/runtime/src/iree/modules/hal/inline/module.h
index f8e881d..be6954b 100644
--- a/runtime/src/iree/modules/hal/inline/module.h
+++ b/runtime/src/iree/modules/hal/inline/module.h
@@ -11,6 +11,7 @@
 
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
+#include "iree/modules/hal/debugging.h"
 #include "iree/modules/hal/types.h"
 #include "iree/vm/api.h"
 
@@ -29,6 +30,7 @@
 // allocations.
 IREE_API_EXPORT iree_status_t iree_hal_inline_module_create(
     iree_vm_instance_t* instance, iree_hal_inline_module_flags_t flags,
+    iree_hal_module_debug_sink_t debug_sink,
     iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator,
     iree_vm_module_t** out_module);
 
diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c
index 9b808fc..38a95c6 100644
--- a/runtime/src/iree/modules/hal/module.c
+++ b/runtime/src/iree/modules/hal/module.c
@@ -38,6 +38,7 @@
 typedef struct iree_hal_module_t {
   iree_allocator_t host_allocator;
   iree_hal_module_flags_t flags;
+  iree_hal_module_debug_sink_t debug_sink;
   iree_host_size_t device_count;
   iree_hal_device_t* devices[];
 } iree_hal_module_t;
@@ -59,6 +60,9 @@
   // application. All instantiations of a module share the same flags.
   iree_hal_module_flags_t flags;
 
+  // Debug sink for routing debug events.
+  iree_hal_module_debug_sink_t debug_sink;
+
   // Total number of devices available to the module.
   iree_host_size_t device_count;
   // Devices referencing the storage in the parent module.
@@ -94,6 +98,7 @@
   memset(state, 0, total_size);
   state->host_allocator = host_allocator;
   state->flags = module->flags;
+  state->debug_sink = module->debug_sink;
   state->device_count = module->device_count;
   state->devices = module->devices;
   state->loop_status = iree_ok_status();
@@ -630,8 +635,27 @@
 IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_trace,  //
                    iree_hal_module_state_t,            //
                    rCrD, v) {
-  return iree_hal_modules_buffer_view_trace(args->r0, args->a1_count, args->a1,
-                                            state->host_allocator);
+  if (state->debug_sink.buffer_view_trace.fn) {
+    iree_vm_buffer_t* key = NULL;
+    IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(args->r0, &key));
+    iree_string_view_t key_str = iree_vm_buffer_as_string(key);
+    iree_host_size_t buffer_view_count =
+        iree_hal_cast_host_size(args->a1_count);
+    if (buffer_view_count > 128) {
+      return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                              "too many buffer views for a single trace call");
+    }
+    iree_hal_buffer_view_t** buffer_views =
+        iree_alloca(buffer_view_count * sizeof(iree_hal_buffer_view_t*));
+    for (iree_host_size_t i = 0; i < buffer_view_count; ++i) {
+      IREE_RETURN_IF_ERROR(
+          iree_hal_buffer_view_check_deref(args->a1[i].r0, &buffer_views[i]));
+    }
+    return state->debug_sink.buffer_view_trace.fn(
+        state->debug_sink.buffer_view_trace.user_data, key_str,
+        buffer_view_count, buffer_views, state->host_allocator);
+  }
+  return iree_ok_status();
 }
 
 //===----------------------------------------------------------------------===//
@@ -1778,7 +1802,8 @@
 IREE_API_EXPORT iree_status_t iree_hal_module_create(
     iree_vm_instance_t* instance, iree_host_size_t device_count,
     iree_hal_device_t** devices, iree_hal_module_flags_t flags,
-    iree_allocator_t host_allocator, iree_vm_module_t** out_module) {
+    iree_hal_module_debug_sink_t debug_sink, iree_allocator_t host_allocator,
+    iree_vm_module_t** out_module) {
   IREE_ASSERT_ARGUMENT(instance);
   IREE_ASSERT_ARGUMENT(device_count);
   IREE_ASSERT_ARGUMENT(devices);
@@ -1817,6 +1842,7 @@
   module->host_allocator = host_allocator;
   // TODO(benvanik): fix vm yield with result storage.
   module->flags = flags | IREE_HAL_MODULE_FLAG_SYNCHRONOUS;
+  module->debug_sink = debug_sink;
   module->device_count = device_count;
   for (iree_host_size_t i = 0; i < device_count; ++i) {
     module->devices[i] = devices[i];
diff --git a/runtime/src/iree/modules/hal/module.h b/runtime/src/iree/modules/hal/module.h
index ee45570..92b32d2 100644
--- a/runtime/src/iree/modules/hal/module.h
+++ b/runtime/src/iree/modules/hal/module.h
@@ -11,6 +11,7 @@
 
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
+#include "iree/modules/hal/debugging.h"
 #include "iree/modules/hal/types.h"
 #include "iree/vm/api.h"
 
@@ -32,7 +33,8 @@
 IREE_API_EXPORT iree_status_t iree_hal_module_create(
     iree_vm_instance_t* instance, iree_host_size_t device_count,
     iree_hal_device_t** devices, iree_hal_module_flags_t flags,
-    iree_allocator_t host_allocator, iree_vm_module_t** out_module);
+    iree_hal_module_debug_sink_t debug_sink, iree_allocator_t host_allocator,
+    iree_vm_module_t** out_module);
 
 // Returns the total number of available devices registered with the HAL module.
 IREE_API_EXPORT iree_host_size_t
diff --git a/runtime/src/iree/modules/hal/utils/buffer_diagnostics.c b/runtime/src/iree/modules/hal/utils/buffer_diagnostics.c
index 1dc51ef..cb66e39 100644
--- a/runtime/src/iree/modules/hal/utils/buffer_diagnostics.c
+++ b/runtime/src/iree/modules/hal/utils/buffer_diagnostics.c
@@ -249,50 +249,3 @@
 
   return shape_status;
 }
-
-iree_status_t iree_hal_modules_buffer_view_trace(
-    iree_vm_ref_t key_ref, iree_vm_size_t buffer_view_count,
-    iree_vm_abi_r_t* buffer_view_refs, iree_allocator_t host_allocator) {
-#if IREE_HAL_MODULE_STRING_UTIL_ENABLE
-
-  iree_vm_buffer_t* key = NULL;
-  IREE_RETURN_IF_ERROR(iree_vm_buffer_check_deref(key_ref, &key));
-  iree_string_view_t key_str = iree_vm_buffer_as_string(key);
-
-  fprintf(stderr, "=== %.*s ===\n", (int)key_str.size, key_str.data);
-  for (iree_host_size_t i = 0; i < buffer_view_count; ++i) {
-    iree_hal_buffer_view_t* buffer_view = NULL;
-    IREE_RETURN_IF_ERROR(
-        iree_hal_buffer_view_check_deref(buffer_view_refs[i].r0, &buffer_view));
-
-    // NOTE: this export is for debugging only and a no-op in min-size builds.
-    // We heap-alloc here because at the point this export is used performance
-    // is not a concern.
-
-    // Query total length (excluding NUL terminator).
-    iree_host_size_t result_length = 0;
-    iree_status_t status = iree_hal_buffer_view_format(
-        buffer_view, IREE_HOST_SIZE_MAX, 0, NULL, &result_length);
-    if (!iree_status_is_out_of_range(status)) {
-      return status;
-    }
-    ++result_length;  // include NUL
-
-    // Allocate scratch heap memory to contain the result and format into it.
-    char* result_str = NULL;
-    IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, result_length,
-                                               (void**)&result_str));
-    status =
-        iree_hal_buffer_view_format(buffer_view, IREE_HOST_SIZE_MAX,
-                                    result_length, result_str, &result_length);
-    if (iree_status_is_ok(status)) {
-      fprintf(stderr, "%.*s\n", (int)result_length, result_str);
-    }
-    iree_allocator_free(host_allocator, result_str);
-    IREE_RETURN_IF_ERROR(status);
-  }
-  fprintf(stderr, "\n");
-
-#endif  // IREE_HAL_MODULE_STRING_UTIL_ENABLE
-  return iree_ok_status();
-}
diff --git a/runtime/src/iree/modules/hal/utils/buffer_diagnostics.h b/runtime/src/iree/modules/hal/utils/buffer_diagnostics.h
index 8c03b22..b025f26 100644
--- a/runtime/src/iree/modules/hal/utils/buffer_diagnostics.h
+++ b/runtime/src/iree/modules/hal/utils/buffer_diagnostics.h
@@ -35,8 +35,4 @@
     iree_host_size_t expected_shape_rank,
     const iree_hal_dim_t* expected_shape_dims);
 
-iree_status_t iree_hal_modules_buffer_view_trace(
-    iree_vm_ref_t key_ref, iree_vm_size_t buffer_view_count,
-    iree_vm_abi_r_t* buffer_view_refs, iree_allocator_t host_allocator);
-
 #endif  // IREE_MODULES_HAL_UTILS_BUFFER_DIAGNOSTICS_H_
diff --git a/runtime/src/iree/runtime/session.c b/runtime/src/iree/runtime/session.c
index 4286178..0c89a12 100644
--- a/runtime/src/iree/runtime/session.c
+++ b/runtime/src/iree/runtime/session.c
@@ -93,10 +93,10 @@
   // Lower-level usage of the VM can avoid the HAL if it's not required.
   iree_vm_module_t* hal_module = NULL;
   if (iree_status_is_ok(status)) {
-    status = iree_hal_module_create(iree_runtime_instance_vm_instance(instance),
-                                    /*device_count=*/1, &device,
-                                    IREE_HAL_MODULE_FLAG_NONE, host_allocator,
-                                    &hal_module);
+    status = iree_hal_module_create(
+        iree_runtime_instance_vm_instance(instance),
+        /*device_count=*/1, &device, IREE_HAL_MODULE_FLAG_NONE,
+        iree_hal_module_debug_sink_stdio(stderr), host_allocator, &hal_module);
   }
   if (iree_status_is_ok(status)) {
     status = iree_vm_context_register_modules(
diff --git a/runtime/src/iree/tooling/context_util.c b/runtime/src/iree/tooling/context_util.c
index 5ec1028..f088ee8 100644
--- a/runtime/src/iree/tooling/context_util.c
+++ b/runtime/src/iree/tooling/context_util.c
@@ -218,9 +218,9 @@
   // Create HAL module wrapping the device created above.
   iree_hal_module_flags_t flags = IREE_HAL_MODULE_FLAG_NONE;
   iree_vm_module_t* module = NULL;
-  iree_status_t status =
-      iree_hal_module_create(instance, device_list->count, device_list->devices,
-                             flags, host_allocator, &module);
+  iree_status_t status = iree_hal_module_create(
+      instance, device_list->count, device_list->devices, flags,
+      iree_hal_module_debug_sink_stdio(stderr), host_allocator, &module);
 
   iree_hal_device_list_free(device_list);
 
@@ -280,7 +280,8 @@
   iree_hal_inline_module_flags_t flags = IREE_HAL_INLINE_MODULE_FLAG_NONE;
   iree_vm_module_t* module = NULL;
   iree_status_t status = iree_hal_inline_module_create(
-      instance, flags, device_allocator, host_allocator, &module);
+      instance, flags, iree_hal_module_debug_sink_stdio(stderr),
+      device_allocator, host_allocator, &module);
 
   if (iree_status_is_ok(status)) {
     *out_module = module;
diff --git a/samples/simple_embedding/simple_embedding.c b/samples/simple_embedding/simple_embedding.c
index b5df80b..bd3fd69 100644
--- a/samples/simple_embedding/simple_embedding.c
+++ b/samples/simple_embedding/simple_embedding.c
@@ -42,7 +42,8 @@
   iree_vm_module_t* hal_module = NULL;
   IREE_RETURN_IF_ERROR(iree_hal_module_create(
       instance, /*device_count=*/1, &device, IREE_HAL_MODULE_FLAG_SYNCHRONOUS,
-      iree_allocator_system(), &hal_module));
+      iree_hal_module_debug_sink_stdio(stderr), iree_allocator_system(),
+      &hal_module));
 
   // Load bytecode module from the embedded data.
   const iree_const_byte_span_t module_data = load_bytecode_module_data();