Adding custom_module/sync/ & custom_module/async/ samples.
These show how to interact with tensor I/O when using either
synchronous or asynchronous custom module calls.
diff --git a/samples/custom_module/sync/CMakeLists.txt b/samples/custom_module/sync/CMakeLists.txt
new file mode 100644
index 0000000..9c52a8a
--- /dev/null
+++ b/samples/custom_module/sync/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Skipped (along with test/) unless the llvm-cpu compiler backend and local-sync
+# runtime driver are enabled. This could be made to work with other backends.
+if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR
+   NOT IREE_HAL_DRIVER_LOCAL_SYNC)
+  return()
+endif()
+
+set(_NAME "iree_samples_custom_module_sync_run")  # CMake target name.
+add_executable(${_NAME} "")
+target_sources(${_NAME}
+  PRIVATE
+    main.c
+    module.cc
+    module.h
+)
+
+set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "custom-module-sync-run")
+
+# Use the IREE default copts. TODO(benvanik): make iree_status_annotate_f always
+# available as a function instead of defining it empty? otherwise optimized
+# runtime builds won't export it but external libraries may pull it in.
+target_compile_options(${_NAME} PRIVATE ${IREE_DEFAULT_COPTS})
+
+target_link_libraries(${_NAME}
+  iree_runtime_runtime
+)
+
+add_subdirectory(test)
diff --git a/samples/custom_module/sync/README.md b/samples/custom_module/sync/README.md
new file mode 100644
index 0000000..96f8616
--- /dev/null
+++ b/samples/custom_module/sync/README.md
@@ -0,0 +1,45 @@
+# Synchronous tensor I/O custom module sample
+
+This sample expects that you've already produced a working version of the
+[basic sample](/samples/custom_module/basic/) (including compiler installation
+and CMake setup).
+
+This sample demonstrates adding custom modules callable from compiler-produced
+programs that take and return `tensor` types. By default custom calls are
+treated as blocking operations that synchronize with the underlying device to
+ensure all passed `tensor` buffer views are host coherent and it's assumed that
+any returned `tensor` buffer views are ready for use when the call returns.
+
+This approach is the easiest to integrate and looks similar to custom calls in
+classic ML frameworks. There are many significant performance implications of
+using this approach, though, and synchronous calls should only be used when
+no asynchronous approach is possible. See the
+[async tensor](/samples/custom_module/async/) sample for how to define
+custom calls that work asynchronously.
+
+## Instructions
+
+1. Compile the [example module](./test/example.mlir) to a .vmfb file:
+
+    ```
+    iree-compile --iree-hal-target-backends=llvm-cpu samples/custom_module/sync/test/example.mlir -o=/tmp/example.vmfb
+    ```
+
+2. Build the `iree_samples_custom_module_sync_run` CMake target:
+
+    ```
+    cmake -B ../iree-build/ -DCMAKE_BUILD_TYPE=RelWithDebInfo . \
+        -DCMAKE_C_FLAGS=-DIREE_VM_EXECUTION_TRACING_FORCE_ENABLE=1
+    cmake --build ../iree-build/ --target iree_samples_custom_module_sync_run
+    ```
+    (here we force runtime execution tracing for demonstration purposes)
+
+    [See here](https://iree-org.github.io/iree/building-from-source/getting-started/)
+    for general instructions on building using CMake.
+
+3. Run the example program to call the main function:
+
+   ```
+   ../iree-build/samples/custom_module/sync/custom-module-sync-run \
+       /tmp/example.vmfb example.main
+   ```
diff --git a/samples/custom_module/sync/main.c b/samples/custom_module/sync/main.c
new file mode 100644
index 0000000..4095dfe
--- /dev/null
+++ b/samples/custom_module/sync/main.c
@@ -0,0 +1,144 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdio.h>
+
+// IREE APIs:
+#include "iree/modules/hal/types.h"
+#include "iree/runtime/api.h"
+
+// Custom native module used in the sample.
+// Modules may be linked in from native code or other bytecode modules loaded at
+// runtime: there's no difference.
+#include "module.h"
+
+// Runs <entry.point> from the given module on a fixed tensor and checks the
+// result. NOTE: CHECKs are dangerous but this is a sample; real applications
+// should handle errors gracefully. In this constrained case they won't fail
+// unless something is catastrophically wrong (out of memory, solar flares).
+int main(int argc, char** argv) {
+  if (argc != 3) {
+    fprintf(stderr,
+            "Usage:\n"
+            "  custom-module-sync-run - <entry.point> # read from stdin\n"
+            "  custom-module-sync-run </path/to/example.vmfb> "
+            "<entry.point>\n");
+    fprintf(stderr, "  (See the README for this sample for details)\n");
+    return -1;
+  }
+
+  // Internally IREE does not (in general) use malloc and instead uses the
+  // provided allocator to allocate and free memory. Applications can integrate
+  // their own allocator as-needed.
+  iree_allocator_t host_allocator = iree_allocator_system();
+
+  // Create and configure the instance shared across all sessions.
+  iree_runtime_instance_options_t instance_options;
+  iree_runtime_instance_options_initialize(&instance_options);
+  iree_runtime_instance_options_use_all_available_drivers(&instance_options);
+  iree_runtime_instance_t* instance = NULL;
+  IREE_CHECK_OK(iree_runtime_instance_create(&instance_options, host_allocator,
+                                             &instance));
+
+  // Try to create the device - it should always succeed as it's a CPU device.
+  iree_hal_device_t* device = NULL;
+  IREE_CHECK_OK(iree_runtime_instance_try_create_default_device(
+      instance, iree_make_cstring_view("local-sync"), &device));
+
+  // Create one session per loaded module to hold the module state.
+  iree_runtime_session_options_t session_options;
+  iree_runtime_session_options_initialize(&session_options);
+  iree_runtime_session_t* session = NULL;
+  IREE_CHECK_OK(iree_runtime_session_create_with_device(
+      instance, &session_options, device,
+      iree_runtime_instance_host_allocator(instance), &session));
+
+  // Create the custom module that can be reused across contexts.
+  // The session retains it on append so our reference is dropped right after.
+  iree_vm_module_t* custom_module = NULL;
+  IREE_CHECK_OK(iree_custom_module_sync_create(
+      iree_runtime_instance_vm_instance(instance), device, host_allocator,
+      &custom_module));
+  IREE_CHECK_OK(iree_runtime_session_append_module(session, custom_module));
+  iree_vm_module_release(custom_module);
+
+  // Load the module from stdin or a file on disk.
+  const char* module_path = argv[1];
+  if (strcmp(module_path, "-") == 0) {
+    IREE_CHECK_OK(
+        iree_runtime_session_append_bytecode_module_from_stdin(session));
+  } else {
+    IREE_CHECK_OK(iree_runtime_session_append_bytecode_module_from_file(
+        session, module_path));
+  }
+
+  iree_string_view_t entry_point = iree_make_cstring_view(argv[2]);
+  fprintf(stdout, "INVOKE BEGIN %.*s\n", (int)entry_point.size,
+          entry_point.data);
+  fflush(stdout);
+
+  iree_vm_list_t* inputs = NULL;
+  IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &inputs));
+  iree_vm_list_t* outputs = NULL;
+  IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &outputs));
+
+  // Pass in the tensor<?xi32> arg:
+  const int32_t input_data[5] = {1, 2, 3, 4, 5};
+  const iree_hal_dim_t shape[1] = {IREE_ARRAYSIZE(input_data)};
+  iree_hal_buffer_view_t* input_view = NULL;
+  IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer(
+      iree_runtime_session_device_allocator(session), IREE_ARRAYSIZE(shape),
+      shape, IREE_HAL_ELEMENT_TYPE_INT_32,
+      IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
+      (iree_hal_buffer_params_t){
+          .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+          .access = IREE_HAL_MEMORY_ACCESS_READ,
+          .usage = IREE_HAL_BUFFER_USAGE_DEFAULT,
+      },
+      iree_make_const_byte_span(input_data, sizeof(input_data)), &input_view));
+  iree_vm_ref_t input_view_ref = iree_hal_buffer_view_move_ref(input_view);
+  IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs, &input_view_ref));
+
+  // Synchronously invoke the requested function.
+  IREE_CHECK_OK(
+      iree_runtime_session_call_by_name(session, entry_point, inputs, outputs));
+
+  // Read back the tensor<?xi32> result:
+  iree_hal_buffer_view_t* output_view =
+      iree_vm_list_get_buffer_view_assign(outputs, 0);
+  int32_t output_data[5] = {0};
+  IREE_CHECK_OK(
+      iree_hal_buffer_map_read(iree_hal_buffer_view_buffer(output_view), 0,
+                               output_data, sizeof(output_data)));
+
+  // Expecting (e^2 * 2)^2 for each input element e:
+  bool did_match = true;
+  for (size_t i = 0; i < IREE_ARRAYSIZE(input_data); ++i) {
+    int32_t t0 = input_data[i];
+    int32_t t1 = t0 * t0;
+    int32_t t2 = t1 * 2;
+    int32_t t3 = t2 * t2;
+    if (t3 != output_data[i]) {
+      fprintf(stdout, "MISMATCH [%zu] expected %d but actual %d\n", i, t3,
+              output_data[i]);
+      did_match = false;
+      break;
+    }
+  }
+  if (did_match) {
+    fprintf(stdout, "MATCHED!\n");
+  }
+
+  iree_vm_list_release(inputs);
+  iree_vm_list_release(outputs);
+
+  fprintf(stdout, "INVOKE END\n");
+  fflush(stdout);
+
+  iree_runtime_session_release(session);
+  iree_hal_device_release(device);
+  iree_runtime_instance_release(instance);
+  return did_match ? 0 : 1;  // Non-zero exit when verification failed.
+}
diff --git a/samples/custom_module/sync/module.cc b/samples/custom_module/sync/module.cc
new file mode 100644
index 0000000..215088d
--- /dev/null
+++ b/samples/custom_module/sync/module.cc
@@ -0,0 +1,195 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "module.h"
+
+#include <cstdio>
+#include <thread>
+
+#include "iree/modules/hal/types.h"
+#include "iree/vm/native_module_cc.h"
+
+// NOTE: this module is written in C++ using the native module wrapper and uses
+// template magic to handle marshaling arguments. For a lot of uses this is a
+// much friendlier way of exposing modules to the IREE VM and if performance and
+// code size are not a concern is a fine route to take. Here we do it for
+// brevity but all of the internal IREE modules are implemented in C.
+
+//===----------------------------------------------------------------------===//
+// VM module interface implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+using namespace iree;
+
+// Approximation of some external library call that populates a buffer.
+// It's assumed that when this is called the |source_buffer| is available to
+// read and the |target_buffer| is available to write (no other readers exist).
+// This sample assumes that the buffers are mappable so we can do the work here
+// but they will not always be. APIs like iree_hal_allocator_import_buffer and
+// iree_hal_allocator_export_buffer can be used in some cases to avoid
+// potentially expensive operations but real applications that care about
+// performance would want to issue async transfer command buffers.
+//
+// Only use this as a reference for when synchronous behavior is absolutely
+// required (old-style blocking file IO/etc).
+static Status SyncSimulatedHostOpI32(iree_hal_buffer_t* source_buffer,
+                                     iree_hal_buffer_t* target_buffer,
+                                     iree_hal_dim_t count) {
+  Status status = OkStatus();
+
+  // Map the source and target buffers into host memory. Note that not all
+  // devices allow this but in this sample we assume they do.
+  iree_hal_buffer_mapping_t source_mapping = {{0}};
+  if (status.ok()) {
+    status = iree_hal_buffer_map_range(
+        source_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+        IREE_HAL_MEMORY_ACCESS_READ, 0, IREE_WHOLE_BUFFER, &source_mapping);
+  }
+  iree_hal_buffer_mapping_t target_mapping = {{0}};
+  if (status.ok()) {
+    status =
+        iree_hal_buffer_map_range(target_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
+                                  IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, 0,
+                                  IREE_WHOLE_BUFFER, &target_mapping);
+  }
+
+  // Slow host-side work (prefer moving this into the program so the compiler
+  // can fuse it). NOTE: assumes both mappings hold at least |count| i32 values.
+  if (status.ok()) {
+    const int32_t* source_ptr =
+        reinterpret_cast<const int32_t*>(source_mapping.contents.data);
+    int32_t* target_ptr =
+        reinterpret_cast<int32_t*>(target_mapping.contents.data);
+    for (iree_host_size_t i = 0; i < count; ++i) {
+      target_ptr[i] = source_ptr[i] * 2;
+    }
+  }
+
+  // We must unmap the buffers before they will be usable.
+  // Note that it's possible for these to fail in cases where the buffer
+  // required emulated mapping but on basic host-local devices like CPU assumed
+  // in this sample that should never happen.
+  iree_status_ignore(iree_hal_buffer_unmap_range(&source_mapping));
+  iree_status_ignore(iree_hal_buffer_unmap_range(&target_mapping));
+
+  return status;
+}
+
+// Per-context module state implementing the `call.sync` import.
+class CustomModuleState final {
+ public:
+  explicit CustomModuleState(vm::ref<iree_hal_device_t> device,
+                             iree_allocator_t host_allocator)
+      : device_(std::move(device)), host_allocator_(host_allocator) {}
+  ~CustomModuleState() = default;
+
+  StatusOr<vm::ref<iree_hal_buffer_view_t>> CallSync(
+      const vm::ref<iree_hal_buffer_view_t> arg_view) {
+    // We can directly access the buffer here but only for reading.
+    // In the future it'll be possible to pass in-place buffers.
+    auto* arg_buffer = iree_hal_buffer_view_buffer(arg_view.get());
+
+    // Synchronously allocate the memory from the device allocator. We could
+    // use queue-ordered allocations but that's unsafe to use from arbitrary
+    // threads and we want to show how to safely do that using the thread-safe
+    // device allocator.
+    //
+    // NOTE: if cloning host memory the initial_data can be passed in to
+    // efficiently upload the memory to the device. If wrapping host memory then
+    // iree_hal_allocator_import_buffer can be used to import the memory without
+    // a copy (if supported). This simple example is showing an in-place style
+    // external call.
+    iree_hal_allocator_t* device_allocator =
+        iree_hal_device_allocator(device_.get());
+    iree_hal_buffer_params_t buffer_params = {
+        /*.usage=*/IREE_HAL_BUFFER_USAGE_DEFAULT |
+            IREE_HAL_BUFFER_USAGE_MAPPING,
+        /*.access=*/IREE_HAL_MEMORY_ACCESS_ALL,
+        /*.type=*/IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE |
+            IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
+        /*.queue_affinity=*/IREE_HAL_QUEUE_AFFINITY_ANY,
+        /*.min_alignment=*/64,
+    };
+    vm::ref<iree_hal_buffer_t> result_buffer;
+    IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer(
+        device_allocator, buffer_params,
+        iree_hal_buffer_view_byte_length(arg_view.get()),
+        iree_const_byte_span_empty(), &result_buffer));
+
+    // Hacky example accessing the source contents and producing the result
+    // contents. This emulates what an external library the user is calling that
+    // expects host void* buffers does.
+    IREE_RETURN_IF_ERROR(SyncSimulatedHostOpI32(
+        arg_buffer, result_buffer.get(),
+        iree_hal_buffer_view_element_count(arg_view.get())));
+
+    // Wrap the buffer in a buffer view that provides the metadata for
+    // runtime verification.
+    vm::ref<iree_hal_buffer_view_t> result_view;
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create_like(
+        result_buffer.get(), arg_view.get(), host_allocator_, &result_view));
+
+    // Note that the caller may immediately use the buffer contents without
+    // waiting: by being synchronous we've indicated that all of the work above
+    // (mapping, the host op, and unmapping) completed before we returned.
+    return result_view;
+  }
+
+ private:
+  // HAL device used for scheduling work and allocations.
+  vm::ref<iree_hal_device_t> device_;
+
+  // Allocator that the caller requested we use for any allocations we need to
+  // perform during operation.
+  iree_allocator_t host_allocator_;
+};
+
+// Table mapping function names (minus the `custom.` prefix) to implementations.
+static const vm::NativeFunction<CustomModuleState> kCustomModuleFunctions[] = {
+    vm::MakeNativeFunction("call.sync", &CustomModuleState::CallSync),
+};
+
+// The module instance that will be allocated and reused across contexts.
+class CustomModule final : public vm::NativeModule<CustomModuleState> {
+ public:
+  using vm::NativeModule<CustomModuleState>::NativeModule;
+
+  void SetDevice(vm::ref<iree_hal_device_t> device) {
+    device_ = std::move(device);
+  }
+
+  // Creates per-context state when the module is added to a new context; each
+  // state retains the device from SetDevice. May be called from any thread.
+  StatusOr<std::unique_ptr<CustomModuleState>> CreateState(
+      iree_allocator_t host_allocator) override {
+    auto state = std::make_unique<CustomModuleState>(vm::retain_ref(device_),
+                                                     host_allocator);
+    return state;
+  }
+
+ private:
+  vm::ref<iree_hal_device_t> device_;  // Handed (retained) to each new state.
+};
+
+}  // namespace
+
+// Creates the module named "custom" (version 0) holding a retained reference to
+// |device|. The C++ implementation stays hidden behind this C entry point.
+extern "C" iree_status_t iree_custom_module_sync_create(
+    iree_vm_instance_t* instance, iree_hal_device_t* device,
+    iree_allocator_t host_allocator, iree_vm_module_t** out_module) {
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+  auto module = std::make_unique<CustomModule>(
+      "custom", /*version=*/0, instance, host_allocator,
+      iree::span<const vm::NativeFunction<CustomModuleState>>(
+          kCustomModuleFunctions));
+  module->SetDevice(vm::retain_ref(device));
+  *out_module = module.release()->interface();
+  return iree_ok_status();
+}
diff --git a/samples/custom_module/sync/module.h b/samples/custom_module/sync/module.h
new file mode 100644
index 0000000..7e293af
--- /dev/null
+++ b/samples/custom_module/sync/module.h
@@ -0,0 +1,34 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_
+#define IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a native custom module that can be reused in multiple contexts and
+// returns it in |out_module|; |device| is retained for use by the module.
+// Shared module-level state would require the module to provide its own
+// synchronization; it's usually safer to treat the module as immutable and
+// keep state within the instantiated module states instead.
+iree_status_t iree_custom_module_sync_create(iree_vm_instance_t* instance,
+                                             iree_hal_device_t* device,
+                                             iree_allocator_t host_allocator,
+                                             iree_vm_module_t** out_module);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_
diff --git a/samples/custom_module/sync/test/CMakeLists.txt b/samples/custom_module/sync/test/CMakeLists.txt
new file mode 100644
index 0000000..66489be
--- /dev/null
+++ b/samples/custom_module/sync/test/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "example.mlir"
+  TOOLS
+    FileCheck
+    iree-compile
+    iree_samples_custom_module_sync_run  # Target for custom-module-sync-run.
+  LABELS
+    "hostonly"
+)
diff --git a/samples/custom_module/sync/test/example.mlir b/samples/custom_module/sync/test/example.mlir
new file mode 100644
index 0000000..78faa31
--- /dev/null
+++ b/samples/custom_module/sync/test/example.mlir
@@ -0,0 +1,43 @@
+// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu | custom-module-sync-run - example.main | FileCheck %s
+
+module @example {
+  //===--------------------------------------------------------------------===//
+  // Imports
+  //===--------------------------------------------------------------------===//
+  // External function declarations for the methods implemented in the custom
+  // module C++ file. Note that they are prefixed with the `custom.` module
+  // name.
+
+  // Synchronous call that takes/returns a tensor.
+  // IREE will block and wait until the input tensor is available, make the
+  // import call, and assume that the returned tensor is immediately available
+  // for use.
+  func.func private @custom.call.sync(tensor<?xi32>) -> tensor<?xi32>
+
+  //===--------------------------------------------------------------------===//
+  // Sample methods
+  //===--------------------------------------------------------------------===//
+  // Note that there can be any number of publicly-exported methods; this
+  // sample exports just one to keep things simple.
+
+  // CHECK-LABEL: INVOKE BEGIN example.main
+  func.func @main(%arg0: tensor<?xi32>) -> tensor<?xi32> {
+    // Compiler-generated dispatch work to show dataflow (squares the input).
+    %0 = arith.muli %arg0, %arg0 : tensor<?xi32>
+
+    // Custom call to a synchronous import.
+    // The runtime will block and wait until %0 is ready before making the call
+    // and assume it can immediately start using the resulting %1 after the call
+    // returns. Note that the top-level invocation will block while this call is
+    // made and if we were running the compiler-generated dispatches above/below
+    // on a GPU it would fully synchronize the host and device (really bad!).
+    %1 = call @custom.call.sync(%0) : (tensor<?xi32>) -> tensor<?xi32>
+
+    // More generated dispatch work to show dataflow (squares %1).
+    %2 = arith.muli %1, %1 : tensor<?xi32>
+
+    // CHECK: MATCHED!
+    return %2 : tensor<?xi32>
+  }
+  // CHECK-NEXT: INVOKE END
+}