Adding custom_module/sync/ & custom_module/async/ samples. These show how to interact with tensor I/O when using either synchronous or asynchronous custom module calls.
diff --git a/samples/custom_module/sync/CMakeLists.txt b/samples/custom_module/sync/CMakeLists.txt new file mode 100644 index 0000000..9c52a8a --- /dev/null +++ b/samples/custom_module/sync/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Sample requires the llvm-cpu compiler backend and the local-sync runtime
+# driver. This could be made to work with other backends.
+if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR
+   NOT IREE_HAL_DRIVER_LOCAL_SYNC)
+  return()
+endif()
+
+set(_NAME "iree_samples_custom_module_sync_run")
+add_executable(${_NAME} "")
+target_sources(${_NAME}
+  PRIVATE
+    main.c
+    module.cc
+    module.h
+)
+
+set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "custom-module-sync-run")
+
+# TODO(benvanik): make iree_status_annotate_f always available as a function
+# instead of defining it empty? otherwise optimized builds of the runtime won't
+# export it but external libraries may pull it in.
+target_compile_options(${_NAME} PRIVATE ${IREE_DEFAULT_COPTS})
+
+# Scoped PRIVATE to match the sources and compile options of this executable.
+target_link_libraries(${_NAME}
+  PRIVATE
+    iree_runtime_runtime
+)
+
+add_subdirectory(test)
diff --git a/samples/custom_module/sync/README.md b/samples/custom_module/sync/README.md new file mode 100644 index 0000000..96f8616 --- /dev/null +++ b/samples/custom_module/sync/README.md
@@ -0,0 +1,45 @@ +# Synchronous tensor I/O custom module sample + +This sample expects that you've already produced a working version of the +[basic sample](/samples/custom_module/basic/) (including compiler installation +and CMake setup). + +This sample demonstrates adding custom modules callable from compiler-produced +programs that take and return `tensor` types. By default custom calls are +treated as blocking operations that synchronize with the underlying device to +ensure all passed `tensor` buffer views are host coherent and it's assumed that +any returned `tensor` buffer views are ready for use when the call returns. + +This approach is the easiest to integrate and looks similar to custom calls in +classic ML frameworks. There are many significant performance implications of +using this approach, though, and synchronous calls should only be used when +no asynchronous approach is possible. See the +[async tensor](/samples/custom_module/async/) sample for how to define +custom calls that work asynchronously. + +## Instructions + +1. Compile the [example module](./test/example.mlir) to a .vmfb file: + + ``` + iree-compile --iree-hal-target-backends=llvm-cpu samples/custom_module/sync/test/example.mlir -o=/tmp/example.vmfb + ``` + +2. Build the `iree_samples_custom_module_sync_run` CMake target: + + ``` + cmake -B ../iree-build/ -DCMAKE_BUILD_TYPE=RelWithDebInfo . \ + -DCMAKE_C_FLAGS=-DIREE_VM_EXECUTION_TRACING_FORCE_ENABLE=1 + cmake --build ../iree-build/ --target iree_samples_custom_module_sync_run + ``` + (Here we force runtime execution tracing for demonstration purposes.) + + [See here](https://iree-org.github.io/iree/building-from-source/getting-started/) + for general instructions on building using CMake. + +3. Run the example program to call the main function: + + ``` + ../iree-build/samples/custom_module/sync/custom-module-sync-run \ + /tmp/example.vmfb example.main + ```
diff --git a/samples/custom_module/sync/main.c b/samples/custom_module/sync/main.c new file mode 100644 index 0000000..4095dfe --- /dev/null +++ b/samples/custom_module/sync/main.c
@@ -0,0 +1,144 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include <stdio.h> + +// IREE APIs: +#include "iree/modules/hal/types.h" +#include "iree/runtime/api.h" + +// Custom native module used in the sample. +// Modules may be linked in from native code or other bytecode modules loaded at +// runtime: there's no difference. +#include "module.h" + +// NOTE: CHECKs are dangerous but this is a sample; a real application would +// want to handle errors gracefully. We know in this constrained case that +// these won't fail unless something is catastrophically wrong (out of memory, +// solar flares, etc). +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stderr, + "Usage:\n" + " custom-module-sync-run - <entry.point> # read from stdin\n" + " custom-module-sync-run </path/to/say_hello.vmfb> " + "<entry.point>\n"); + fprintf(stderr, " (See the README for this sample for details)\n "); + return -1; + } + + // Internally IREE does not (in general) use malloc and instead uses the + // provided allocator to allocate and free memory. Applications can integrate + // their own allocator as-needed. + iree_allocator_t host_allocator = iree_allocator_system(); + + // Create and configure the instance shared across all sessions. + iree_runtime_instance_options_t instance_options; + iree_runtime_instance_options_initialize(&instance_options); + iree_runtime_instance_options_use_all_available_drivers(&instance_options); + iree_runtime_instance_t* instance = NULL; + IREE_CHECK_OK(iree_runtime_instance_create(&instance_options, host_allocator, + &instance)); + + // Try to create the device - it should always succeed as it's a CPU device. 
+ iree_hal_device_t* device = NULL; + IREE_CHECK_OK(iree_runtime_instance_try_create_default_device( + instance, iree_make_cstring_view("local-sync"), &device)); + + // Create one session per loaded module to hold the module state. + iree_runtime_session_options_t session_options; + iree_runtime_session_options_initialize(&session_options); + iree_runtime_session_t* session = NULL; + IREE_CHECK_OK(iree_runtime_session_create_with_device( + instance, &session_options, device, + iree_runtime_instance_host_allocator(instance), &session)); + + // Create the custom module that can be reused across contexts. + iree_vm_module_t* custom_module = NULL; + IREE_CHECK_OK(iree_custom_module_sync_create( + iree_runtime_instance_vm_instance(instance), device, host_allocator, + &custom_module)); + IREE_CHECK_OK(iree_runtime_session_append_module(session, custom_module)); + iree_vm_module_release(custom_module); + + // Load the module from stdin or a file on disk. + const char* module_path = argv[1]; + if (strcmp(module_path, "-") == 0) { + IREE_CHECK_OK( + iree_runtime_session_append_bytecode_module_from_stdin(session)); + } else { + IREE_CHECK_OK(iree_runtime_session_append_bytecode_module_from_file( + session, module_path)); + } + + iree_string_view_t entry_point = iree_make_cstring_view(argv[2]); + fprintf(stdout, "INVOKE BEGIN %.*s\n", (int)entry_point.size, + entry_point.data); + fflush(stdout); + + iree_vm_list_t* inputs = NULL; + IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &inputs)); + iree_vm_list_t* outputs = NULL; + IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &outputs)); + + // Pass in the tensor<?xi32> arg: + const int32_t input_data[5] = {1, 2, 3, 4, 5}; + const iree_hal_dim_t shape[1] = {IREE_ARRAYSIZE(input_data)}; + iree_hal_buffer_view_t* input_view = NULL; + IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer( + iree_runtime_session_device_allocator(session), IREE_ARRAYSIZE(shape), + shape, IREE_HAL_ELEMENT_TYPE_INT_32, + 
IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, + (iree_hal_buffer_params_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .access = IREE_HAL_MEMORY_ACCESS_READ, + .usage = IREE_HAL_BUFFER_USAGE_DEFAULT, + }, + iree_make_const_byte_span(input_data, sizeof(input_data)), &input_view)); + iree_vm_ref_t input_view_ref = iree_hal_buffer_view_move_ref(input_view); + IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs, &input_view_ref)); + + // Synchronously invoke the requested function. + IREE_CHECK_OK( + iree_runtime_session_call_by_name(session, entry_point, inputs, outputs)); + + // Read back the tensor<?xi32> result: + iree_hal_buffer_view_t* output_view = + iree_vm_list_get_buffer_view_assign(outputs, 0); + int32_t output_data[5] = {0}; + IREE_CHECK_OK( + iree_hal_buffer_map_read(iree_hal_buffer_view_buffer(output_view), 0, + output_data, sizeof(output_data))); + + // Expecting (e^2 * 2)^2: + bool did_match = true; + for (size_t i = 0; i < IREE_ARRAYSIZE(input_data); ++i) { + int32_t t0 = input_data[i]; + int32_t t1 = t0 * t0; + int32_t t2 = t1 * 2; + int32_t t3 = t2 * t2; + if (t3 != output_data[i]) { + fprintf(stdout, "MISMATCH [%zu] expected %d but actual %d\n", i, t3, + output_data[i]); + did_match = false; + break; + } + } + if (did_match) { + fprintf(stdout, "MATCHED!\n"); + } + + iree_vm_list_release(inputs); + iree_vm_list_release(outputs); + + fprintf(stdout, "INVOKE END\n"); + fflush(stdout); + + iree_runtime_session_release(session); + iree_hal_device_release(device); + iree_runtime_instance_release(instance); + return 0; +}
diff --git a/samples/custom_module/sync/module.cc b/samples/custom_module/sync/module.cc new file mode 100644 index 0000000..215088d --- /dev/null +++ b/samples/custom_module/sync/module.cc
@@ -0,0 +1,195 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "module.h" + +#include <cstdio> +#include <thread> + +#include "iree/modules/hal/types.h" +#include "iree/vm/native_module_cc.h" + +// NOTE: this module is written in C++ using the native module wrapper and uses +// template magic to handle marshaling arguments. For a lot of uses this is a +// much friendlier way of exposing modules to the IREE VM and if performance and +// code size are not a concern is a fine route to take. Here we do it for +// brevity but all of the internal IREE modules are implemented in C. + +//===----------------------------------------------------------------------===// +// VM module interface implementation +//===----------------------------------------------------------------------===// + +namespace { + +using namespace iree; + +// Approximation of some external library call that populates a buffer. +// It's assumed that when this is called the |source_buffer| is available to +// read and the |target_buffer| is available to write (no other readers exist). +// This sample assumes that the buffers are mappable so we can do the work here +// but they will not always be. APIs like iree_hal_allocator_import_buffer and +// iree_hal_allocator_export_buffer can be used in some cases to avoid +// potentially expensive operations but real applications that care about +// performance would want to issue async transfer command buffers. +// +// Only use this as a reference for when synchronous behavior is absolutely +// required (old-style blocking file IO/etc). +static Status SyncSimulatedHostOpI32(iree_hal_buffer_t* source_buffer, + iree_hal_buffer_t* target_buffer, + iree_hal_dim_t count) { + Status status = OkStatus(); + + // Map the source and target buffers into host memory. 
Note that not all + // devices allow this but in this sample we assume they do. + iree_hal_buffer_mapping_t source_mapping = {{0}}; + if (status.ok()) { + status = iree_hal_buffer_map_range( + source_buffer, IREE_HAL_MAPPING_MODE_SCOPED, + IREE_HAL_MEMORY_ACCESS_READ, 0, IREE_WHOLE_BUFFER, &source_mapping); + } + iree_hal_buffer_mapping_t target_mapping = {{0}}; + if (status.ok()) { + status = + iree_hal_buffer_map_range(target_buffer, IREE_HAL_MAPPING_MODE_SCOPED, + IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, 0, + IREE_WHOLE_BUFFER, &target_mapping); + } + + // Sad slow host work. Whenever possible it's worth it to move these into the + // program so the IREE compiler can fuse and accelerate these operations. + if (status.ok()) { + const int32_t* source_ptr = + reinterpret_cast<const int32_t*>(source_mapping.contents.data); + int32_t* target_ptr = + reinterpret_cast<int32_t*>(target_mapping.contents.data); + for (iree_host_size_t i = 0; i < count; ++i) { + target_ptr[i] = source_ptr[i] * 2; + } + } + + // We must unmap the buffers before they will be usable. + // Note that it's possible for these to fail in cases where the buffer + // required emulated mapping but on basic host-local devices like CPU assumed + // in this sample that should never happen. + iree_status_ignore(iree_hal_buffer_unmap_range(&source_mapping)); + iree_status_ignore(iree_hal_buffer_unmap_range(&target_mapping)); + + return status; +} + +// Per-context module state. +class CustomModuleState final { + public: + explicit CustomModuleState(vm::ref<iree_hal_device_t> device, + iree_allocator_t host_allocator) + : device_(std::move(device)), host_allocator_(host_allocator) {} + ~CustomModuleState() = default; + + StatusOr<vm::ref<iree_hal_buffer_view_t>> CallSync( + const vm::ref<iree_hal_buffer_view_t> arg_view) { + // We can directly access the buffer here but only for reading. + // In the future it'll be possible to pass in-place buffers. 
+ auto* arg_buffer = iree_hal_buffer_view_buffer(arg_view.get()); + + // Synchronously allocate the memory from the device allocator. We could + // use queue-ordered allocations but that's unsafe to use from arbitrary + // threads and we want to show how to safely do that using the thread-safe + // device allocator. + // + // NOTE: if cloning host memory the initial_data can be passed in to + // efficiently upload the memory to the device. If wrapping host memory then + // iree_hal_allocator_import_buffer can be used to import the memory without + // a copy (if supported). This simple example is showing an in-place style + // external call. + iree_hal_allocator_t* device_allocator = + iree_hal_device_allocator(device_.get()); + iree_hal_buffer_params_t buffer_params = { + /*.usage=*/IREE_HAL_BUFFER_USAGE_DEFAULT | + IREE_HAL_BUFFER_USAGE_MAPPING, + /*.access=*/IREE_HAL_MEMORY_ACCESS_ALL, + /*.type=*/IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE, + /*.queue_affinity=*/IREE_HAL_QUEUE_AFFINITY_ANY, + /*.min_alignment=*/64, + }; + vm::ref<iree_hal_buffer_t> result_buffer; + IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( + device_allocator, buffer_params, + iree_hal_buffer_view_byte_length(arg_view.get()), + iree_const_byte_span_empty(), &result_buffer)); + + // Hacky example accessing the source contents and producing the result + // contents. This emulates what an external library the user is calling that + // expects host void* buffers does. + IREE_RETURN_IF_ERROR(SyncSimulatedHostOpI32( + arg_buffer, result_buffer.get(), + iree_hal_buffer_view_element_count(arg_view.get()))); + + // Wrap the buffer in a buffer view that provides the metadata for + // runtime verification. 
+ vm::ref<iree_hal_buffer_view_t> result_view; + IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create_like( + result_buffer.get(), arg_view.get(), host_allocator_, &result_view)); + + // Note that the caller may immediately use the buffer contents without + // waiting as by being synchronous we've indicated that we waited ourselves + // (the thread join above). + return result_view; + } + + private: + // HAL device used for scheduling work and allocations. + vm::ref<iree_hal_device_t> device_; + + // Allocator that the caller requested we use for any allocations we need to + // perform during operation. + iree_allocator_t host_allocator_; +}; + +// Function table mapping imported function names to their implementation. +static const vm::NativeFunction<CustomModuleState> kCustomModuleFunctions[] = { + vm::MakeNativeFunction("call.sync", &CustomModuleState::CallSync), +}; + +// The module instance that will be allocated and reused across contexts. +class CustomModule final : public vm::NativeModule<CustomModuleState> { + public: + using vm::NativeModule<CustomModuleState>::NativeModule; + + void SetDevice(vm::ref<iree_hal_device_t> device) { + device_ = std::move(device); + } + + // Creates per-context state when the module is added to a new context. + // May be called from any thread. + StatusOr<std::unique_ptr<CustomModuleState>> CreateState( + iree_allocator_t host_allocator) override { + auto state = std::make_unique<CustomModuleState>(vm::retain_ref(device_), + host_allocator); + return state; + } + + private: + vm::ref<iree_hal_device_t> device_; +}; + +} // namespace + +// Note that while we are using C++ bindings internally we still expose the +// module as a C instance. This hides the details of our implementation. 
+extern "C" iree_status_t iree_custom_module_sync_create( + iree_vm_instance_t* instance, iree_hal_device_t* device, + iree_allocator_t host_allocator, iree_vm_module_t** out_module) { + IREE_ASSERT_ARGUMENT(out_module); + *out_module = NULL; + auto module = std::make_unique<CustomModule>( + "custom", /*version=*/0, instance, host_allocator, + iree::span<const vm::NativeFunction<CustomModuleState>>( + kCustomModuleFunctions)); + module->SetDevice(vm::retain_ref(device)); + *out_module = module.release()->interface(); + return iree_ok_status(); +}
diff --git a/samples/custom_module/sync/module.h b/samples/custom_module/sync/module.h new file mode 100644 index 0000000..7e293af --- /dev/null +++ b/samples/custom_module/sync/module.h
@@ -0,0 +1,34 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_
+#define IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/vm/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a native custom module that can be reused in multiple contexts.
+// The module retains |device| and hands each new context its own state using
+// that device; keep mutable state in those per-context states since anything
+// held on the module itself is shared and would need synchronization.
+// On success |out_module| receives the new module, which the caller releases.
+iree_status_t iree_custom_module_sync_create(iree_vm_instance_t* instance,
+                                             iree_hal_device_t* device,
+                                             iree_allocator_t host_allocator,
+                                             iree_vm_module_t** out_module);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_
diff --git a/samples/custom_module/sync/test/CMakeLists.txt b/samples/custom_module/sync/test/CMakeLists.txt new file mode 100644 index 0000000..66489be --- /dev/null +++ b/samples/custom_module/sync/test/CMakeLists.txt
@@ -0,0 +1,18 @@ +# Copyright 2022 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_lit_test_suite( + NAME + lit + SRCS + "example.mlir" + TOOLS + FileCheck + iree-compile + iree_samples_custom_module_sync_run + LABELS + "hostonly" +)
diff --git a/samples/custom_module/sync/test/example.mlir b/samples/custom_module/sync/test/example.mlir new file mode 100644 index 0000000..78faa31 --- /dev/null +++ b/samples/custom_module/sync/test/example.mlir
@@ -0,0 +1,43 @@ +// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu | custom-module-sync-run - example.main | FileCheck %s + +module @example { + //===--------------------------------------------------------------------===// + // Imports + //===--------------------------------------------------------------------===// + // External function declarations for the methods implemented in the custom + // module C++ file. Note that they are prefixed with the `custom.` module + // name. + + // Synchronous call that takes/returns a tensor. + // IREE will block and wait until the input tensor is available, make the + // import call, and assume that the returned tensor is immediately available + // for use. + func.func private @custom.call.sync(tensor<?xi32>) -> tensor<?xi32> + + //===--------------------------------------------------------------------===// + // Sample methods + //===--------------------------------------------------------------------===// + // Note that there can be any number of publicly-exported methods; this simple + // sample just has one to keep things simple. + + // CHECK-LABEL: INVOKE BEGIN example.main + func.func @main(%arg0: tensor<?xi32>) -> tensor<?xi32> { + // Compiler-generated dispatch work to show dataflow. + %0 = arith.muli %arg0, %arg0 : tensor<?xi32> + + // Custom call to a synchronous import. + // The runtime will block and wait until %0 is ready before making the call + // and assume it can immediately start using the resulting %1 after the call + // returns. Note that the top-level invocation will block while this call is + // made and if we were running the compiler-generated dispatches above/below + // on a GPU it would fully synchronize the host and device (really bad!). + %1 = call @custom.call.sync(%0) : (tensor<?xi32>) -> tensor<?xi32> + + // More generated dispatch work to show dataflow. + %2 = arith.muli %1, %1 : tensor<?xi32> + + // CHECK: MATCHED! + return %2 : tensor<?xi32> + } + // CHECK-NEXT: INVOKE END +}