Adding custom_module/sync/ & custom_module/async/ samples. These show how to interact with tensor I/O when using either synchronous or asynchronous custom module calls.
diff --git a/samples/custom_module/README.md b/samples/custom_module/README.md index 5b603ed..be2423f 100644 --- a/samples/custom_module/README.md +++ b/samples/custom_module/README.md
@@ -13,3 +13,28 @@ * C++ VM wrappers for defining modules and using reference types * Weak imports/fallback functions * Custom types exposed to the compiler + +## Tensor I/O + +### Synchronous call sample + +[samples/custom_module/sync/](/samples/custom_module/sync/README.md) +shows how to pass tensors to and from custom module imports with synchronous +execution. This approximates what a classic ML synchronous custom op may do by +presenting the tensor I/O as if they were host-synchronous buffers. This is the +lowest-performance way of running custom code and should be avoided when +possible. + +* `tensor` types <-> HAL buffer views +* Host buffer mapping and manipulation + +### Asynchronous call sample + +[samples/custom_module/async/](/samples/custom_module/async/README.md) +shows how to pass tensors to and from custom module imports with asynchronous +execution. This shows how to move tensors across threads/frameworks in a +non-blocking way that allows IREE to overlap execution with custom user code. + +* `tensor` types <-> HAL buffer views +* Fences for waiting on inputs and signaling readiness of outputs +* Side-effect annotations for wait-free imports
diff --git a/samples/custom_module/async/CMakeLists.txt b/samples/custom_module/async/CMakeLists.txt new file mode 100644 index 0000000..0cf7821 --- /dev/null +++ b/samples/custom_module/async/CMakeLists.txt
@@ -0,0 +1,35 @@ +# Copyright 2022 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Sample requires the llvm-cpu compiler backend and the local-task runtime +# driver. This could be made to work with other backends but async is only +# really useful to demonstrate with an async target. +if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR + NOT IREE_HAL_DRIVER_LOCAL_TASK) + return() +endif() + +set(_NAME "iree_samples_custom_module_async_run") +add_executable(${_NAME} "") +target_sources(${_NAME} + PRIVATE + main.c + module.cc + module.h +) + +set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "custom-module-async-run") + +# TODO(benvanik): make iree_status_annotate_f always available as a function +# instead of defining it empty? otherwise optimized builds of the runtime won't +# export it but external libraries may pull it in. +target_compile_options(${_NAME} PRIVATE ${IREE_DEFAULT_COPTS}) + +target_link_libraries(${_NAME} + iree_runtime_runtime +) + +add_subdirectory(test)
diff --git a/samples/custom_module/async/README.md b/samples/custom_module/async/README.md new file mode 100644 index 0000000..18bae69 --- /dev/null +++ b/samples/custom_module/async/README.md
@@ -0,0 +1,50 @@ +# Asynchronous tensor I/O custom module sample + +This sample expects that you've already produced a working version of the +[basic sample](/samples/custom_module/basic/) (including compiler installation +and CMake setup). + +This sample demonstrates adding custom modules callable from compiler-produced +programs that take and return `tensor` types. Both the calls into the compiled +program and the custom call made from the compiled program are made +asynchronously using HAL fences for ordering work. This allows the entire +invocation - including the custom user call - to be scheduled without blocking +and enables pipelining and overlapping invocations. When embedded into a larger +user-level framework this lets IREE invocations be interleaved with other user +work. + +## Instructions + +1. Compile the [example module](./test/example.mlir) to a .vmfb file: + + ``` + iree-compile \ + --iree-execution-model=async-external \ + --iree-hal-target-backends=llvm-cpu \ + samples/custom_module/async/test/example.mlir \ + -o=/tmp/example.vmfb + ``` + +2. Build the `iree_samples_custom_module_async_run` CMake target: + + ``` + cmake -B ../iree-build/ -DCMAKE_BUILD_TYPE=RelWithDebInfo . \ + -DCMAKE_C_FLAGS=-DIREE_VM_EXECUTION_TRACING_FORCE_ENABLE=1 + cmake --build ../iree-build/ --target iree_samples_custom_module_async_run + ``` + (here we force runtime execution tracing for demonstration purposes) + + [See here](https://iree-org.github.io/iree/building-from-source/getting-started/) + for general instructions on building using CMake. + +3. Run the example program to call the main function: + + ``` + ../iree-build/samples/custom_module/async/custom-module-async-run \ + /tmp/example.vmfb example.main + ``` + +## TBD + +* Expose a way to tie call arguments and results for in-place operations. +* Expose a way to specify the lifetime of the I/O to allow for transient memory.
diff --git a/samples/custom_module/async/main.c b/samples/custom_module/async/main.c new file mode 100644 index 0000000..5e2a768 --- /dev/null +++ b/samples/custom_module/async/main.c
@@ -0,0 +1,188 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include <stdio.h> + +// IREE APIs: +#include "iree/modules/hal/types.h" +#include "iree/runtime/api.h" + +// Custom native module used in the sample. +// Modules may be linked in from native code or other bytecode modules loaded at +// runtime: there's no difference. +#include "module.h" + +// NOTE: CHECKs are dangerous but this is a sample; a real application would +// want to handle errors gracefully. We know in this constrained case that +// these won't fail unless something is catastrophically wrong (out of memory, +// solar flares, etc). +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stderr, + "Usage:\n" + " custom-module-async-run - <entry.point> # read from stdin\n" + " custom-module-async-run </path/to/say_hello.vmfb> " + "<entry.point>\n"); + fprintf(stderr, " (See the README for this sample for details)\n "); + return -1; + } + + // Internally IREE does not (in general) use malloc and instead uses the + // provided allocator to allocate and free memory. Applications can integrate + // their own allocator as-needed. + iree_allocator_t host_allocator = iree_allocator_system(); + + // Create and configure the instance shared across all sessions. + iree_runtime_instance_options_t instance_options; + iree_runtime_instance_options_initialize(&instance_options); + iree_runtime_instance_options_use_all_available_drivers(&instance_options); + iree_runtime_instance_t* instance = NULL; + IREE_CHECK_OK(iree_runtime_instance_create(&instance_options, host_allocator, + &instance)); + + // Try to create the device - it should always succeed as it's a CPU device. 
+ iree_hal_device_t* device = NULL; + IREE_CHECK_OK(iree_runtime_instance_try_create_default_device( + instance, iree_make_cstring_view("local-task"), &device)); + + // Create one session per loaded module to hold the module state. + iree_runtime_session_options_t session_options; + iree_runtime_session_options_initialize(&session_options); + // Useful to see the VM program flow: + // session_options.context_flags = IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION; + iree_runtime_session_t* session = NULL; + IREE_CHECK_OK(iree_runtime_session_create_with_device( + instance, &session_options, device, + iree_runtime_instance_host_allocator(instance), &session)); + + // Create the custom module that can be reused across contexts. + iree_vm_module_t* custom_module = NULL; + IREE_CHECK_OK(iree_custom_module_async_create( + iree_runtime_instance_vm_instance(instance), device, host_allocator, + &custom_module)); + IREE_CHECK_OK(iree_runtime_session_append_module(session, custom_module)); + iree_vm_module_release(custom_module); + + // Load the module from stdin or a file on disk. 
+ const char* module_path = argv[1]; + if (strcmp(module_path, "-") == 0) { + IREE_CHECK_OK( + iree_runtime_session_append_bytecode_module_from_stdin(session)); + } else { + IREE_CHECK_OK(iree_runtime_session_append_bytecode_module_from_file( + session, module_path)); + } + + iree_vm_list_t* inputs = NULL; + IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &inputs)); + iree_vm_list_t* outputs = NULL; + IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &outputs)); + + // Pass in the tensor<?xi32> arg: + const int32_t input_data[5] = {1, 2, 3, 4, 5}; + const iree_hal_dim_t shape[1] = {IREE_ARRAYSIZE(input_data)}; + iree_hal_buffer_view_t* input_view = NULL; + IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer( + iree_runtime_session_device_allocator(session), IREE_ARRAYSIZE(shape), + shape, IREE_HAL_ELEMENT_TYPE_INT_32, + IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, + (iree_hal_buffer_params_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .access = IREE_HAL_MEMORY_ACCESS_READ, + .usage = IREE_HAL_BUFFER_USAGE_DEFAULT, + }, + iree_make_const_byte_span(input_data, sizeof(input_data)), &input_view)); + iree_vm_ref_t input_view_ref = iree_hal_buffer_view_move_ref(input_view); + IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs, &input_view_ref)); + + // Create our own timeline and set fences at T=1 and T=2. + // We'll pass these in with the timeline at T=0 so that the runtime isn't + // allowed to execute anything until we give it the go-ahead. 
+ iree_hal_semaphore_t* semaphore = NULL; + IREE_CHECK_OK(iree_hal_semaphore_create(device, 0ull, &semaphore)); + iree_hal_fence_t* fence_t1 = NULL; + IREE_CHECK_OK( + iree_hal_fence_create_at(semaphore, 1ull, host_allocator, &fence_t1)); + iree_hal_fence_t* fence_t2 = NULL; + IREE_CHECK_OK( + iree_hal_fence_create_at(semaphore, 2ull, host_allocator, &fence_t2)); + iree_hal_semaphore_release(semaphore); + fprintf(stdout, "INITIALIZE T=0\n"); + fflush(stdout); + + // Add the (wait_fence, signal_fence) pair to the function call. + // The --iree-execution-model=async-external flag adds these required + // arguments to the functions exported by the module. + iree_vm_ref_t fence_t1_ref = iree_hal_fence_retain_ref(fence_t1); + IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs, &fence_t1_ref)); + iree_vm_ref_t fence_t2_ref = iree_hal_fence_retain_ref(fence_t2); + IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs, &fence_t2_ref)); + + // Let the call start executing by signaling the timeline to T=1. + // TODO(benvanik): fix wait-before-signal on queue-ordered allocations. + // For now we have to signal to T=1 before invoking the function but that's + // only temporary. This should be moved down to after the VM invocation + // returns so that we can show how all of the program execution can be + // deferred. We could simulate this with another thread that signaled in the + // future if we wanted. + IREE_CHECK_OK(iree_hal_fence_signal(fence_t1)); + fprintf(stdout, "SIGNALED T=1\n"); + fflush(stdout); + + // Invoke the target function. + // This will return immediately after scheduling work - including the custom + // call - but will not actually execute anything until we say it's OK by + // advancing the timeline to T=1. 
+ iree_string_view_t entry_point = iree_make_cstring_view(argv[2]); + fprintf(stdout, "VM INVOKE BEGIN %.*s\n", (int)entry_point.size, + entry_point.data); + fflush(stdout); + IREE_CHECK_OK( + iree_runtime_session_call_by_name(session, entry_point, inputs, outputs)); + fprintf(stdout, "VM INVOKE END\n"); + fflush(stdout); + + // We could go do other things now while the async work progresses. Here we + // just immediately wait. + IREE_CHECK_OK(iree_hal_fence_wait(fence_t2, iree_infinite_timeout())); + fprintf(stdout, "REACHED T=2\n"); + fflush(stdout); + + // Read back the tensor<?xi32> result: + iree_hal_buffer_view_t* output_view = + iree_vm_list_get_buffer_view_assign(outputs, 0); + int32_t output_data[5] = {0}; + IREE_CHECK_OK( + iree_hal_buffer_map_read(iree_hal_buffer_view_buffer(output_view), 0, + output_data, sizeof(output_data))); + + // Expecting (e^2 * 2)^2: + bool did_match = true; + for (size_t i = 0; i < IREE_ARRAYSIZE(input_data); ++i) { + int32_t t0 = input_data[i]; + int32_t t1 = t0 * t0; + int32_t t2 = t1 * 2; + int32_t t3 = t2 * t2; + if (t3 != output_data[i]) { + fprintf(stdout, "MISMATCH [%zu] expected %d but actual %d\n", i, t3, + output_data[i]); + did_match = false; + break; + } + } + if (did_match) { + fprintf(stdout, "MATCHED!\n"); + } + + iree_vm_list_release(inputs); + iree_vm_list_release(outputs); + iree_hal_fence_release(fence_t1); + iree_hal_fence_release(fence_t2); + iree_runtime_session_release(session); + iree_hal_device_release(device); + iree_runtime_instance_release(instance); + return 0; +}
diff --git a/samples/custom_module/async/module.cc b/samples/custom_module/async/module.cc new file mode 100644 index 0000000..8a4bde7 --- /dev/null +++ b/samples/custom_module/async/module.cc
@@ -0,0 +1,313 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "module.h" + +#include <cstdio> +#include <thread> + +#include "iree/modules/hal/types.h" +#include "iree/vm/native_module_cc.h" + +// NOTE: this module is written in C++ using the native module wrapper and uses +// template magic to handle marshaling arguments. For a lot of uses this is a +// much friendlier way of exposing modules to the IREE VM and if performance and +// code size are not a concern is a fine route to take. Here we do it for +// brevity but all of the internal IREE modules are implemented in C. + +//===----------------------------------------------------------------------===// +// VM module interface implementation +//===----------------------------------------------------------------------===// + +namespace { + +using namespace iree; + +// Approximation of some external library call that populates a buffer. +// It's assumed that when this is called the |source_buffer| is available to +// read and the |target_buffer| is available to write (no other readers exist). +// This sample assumes that the buffers are mappable so we can do the work here +// but they will not always be. APIs like iree_hal_allocator_import_buffer and +// iree_hal_allocator_export_buffer can be used in some cases to avoid +// potentially expensive operations but real applications that care about +// performance would want to issue async transfer command buffers. +// +// Only use this as a reference for when synchronous behavior is absolutely +// required (old-style blocking file IO/etc). +static Status SyncSimulatedHostOpI32(iree_hal_buffer_t* source_buffer, + iree_hal_buffer_t* target_buffer, + iree_hal_dim_t count) { + Status status = OkStatus(); + + // Map the source and target buffers into host memory. 
Note that not all + // devices allow this but in this sample we assume they do. + iree_hal_buffer_mapping_t source_mapping = {{0}}; + if (status.ok()) { + status = iree_hal_buffer_map_range( + source_buffer, IREE_HAL_MAPPING_MODE_SCOPED, + IREE_HAL_MEMORY_ACCESS_READ, 0, IREE_WHOLE_BUFFER, &source_mapping); + } + iree_hal_buffer_mapping_t target_mapping = {{0}}; + if (status.ok()) { + status = + iree_hal_buffer_map_range(target_buffer, IREE_HAL_MAPPING_MODE_SCOPED, + IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, 0, + IREE_WHOLE_BUFFER, &target_mapping); + } + + // Sad slow host work. Whenever possible it's worth it to move these into the + // program so the IREE compiler can fuse and accelerate these operations. + if (status.ok()) { + const int32_t* source_ptr = + reinterpret_cast<const int32_t*>(source_mapping.contents.data); + int32_t* target_ptr = + reinterpret_cast<int32_t*>(target_mapping.contents.data); + for (iree_host_size_t i = 0; i < count; ++i) { + target_ptr[i] = source_ptr[i] * 2; + } + } + + // We must unmap the buffers before they will be usable. + // Note that it's possible for these to fail in cases where the buffer + // required emulated mapping but on basic host-local devices like CPU assumed + // in this sample that should never happen. + iree_status_ignore(iree_hal_buffer_unmap_range(&source_mapping)); + iree_status_ignore(iree_hal_buffer_unmap_range(&target_mapping)); + + return status; +} + +// Represents some kind of stateful async operation. +// Here we spin up a thread to wait on the wait_fence, do some expensive work, +// and then signal the signal_fence. +// +// **This is not actually how this should be done** - spinning up a thread for +// each operation is extremely wasteful and doing so will contend with the +// threads IREE uses for scheduling its compute workloads. This is pretty much +// the worst way to run asynchronous work (but at least it's async!). 
Instead +// think of this as an example of calling off to some service/system layer where +// the ownership of the work scheduling is not in control of the application +// (like networking or RPC). +// +// Each AsyncOp instance is used for a single operation and deletes itself when +// the operation is complete. In order to prevent hangs it's critical that the +// signal_fence is signaled or marked as failing. +// +// TODO(benvanik): demonstrate getting the iree_task_executor_t for direct use. +class AsyncOp { + public: + static void Launch(vm::ref<iree_hal_buffer_view_t> source_view, + vm::ref<iree_hal_buffer_view_t> target_view, + vm::ref<iree_hal_fence_t> wait_fence, + vm::ref<iree_hal_fence_t> signal_fence) { + new AsyncOp(std::move(source_view), std::move(target_view), + std::move(wait_fence), std::move(signal_fence)); + } + + private: + AsyncOp(vm::ref<iree_hal_buffer_view_t> source_view, + vm::ref<iree_hal_buffer_view_t> target_view, + vm::ref<iree_hal_fence_t> wait_fence, + vm::ref<iree_hal_fence_t> signal_fence) + : source_view_(std::move(source_view)), + target_view_(std::move(target_view)), + wait_fence_(std::move(wait_fence)), + signal_fence_(std::move(signal_fence)), + thread_([this]() { + thread_.detach(); + ThreadEntry(); + delete this; // self cleanup + }) {} + + void ThreadEntry() { + IREE_TRACE_SET_THREAD_NAME("std-thread-worker"); + IREE_TRACE_SCOPE(); + + fprintf(stdout, "ASYNC: BEFORE WAIT\n"); + fflush(stdout); + + // Give a pause to simulate doing something expensive. + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // Wait until the tensor is ready for use. A real application could + // export the fence to a native wait handle they could use with syscalls + // or add the fence to a multi-wait operation. Here we just block the + // thread until ready. Due to the nature of ordering it's possible the + // fence has already been signaled by the time we get here. 
+ Status status = + iree_hal_fence_wait(wait_fence_.get(), iree_infinite_timeout()); + + fprintf(stdout, "ASYNC: AFTER WAIT\n"); + fflush(stdout); + + // Perform the expensive work while the input tensor is known good and + // the output is ready to accept it. + if (status.ok()) { + // Hacky example accessing the source contents and producing the result + // contents. This emulates what an external library the user is calling + // that expects host void* buffers does. + status = SyncSimulatedHostOpI32( + iree_hal_buffer_view_buffer(source_view_.get()), + iree_hal_buffer_view_buffer(target_view_.get()), + iree_hal_buffer_view_element_count(source_view_.get())); + } + + fprintf(stdout, "ASYNC: BEFORE SIGNAL\n"); + fflush(stdout); + + // Try to signal completion so that downstream consumers of the result + // can get scheduled. + if (status.ok()) { + status = iree_hal_fence_signal(signal_fence_.get()); + } + + // If we failed then we propagate the failure status. This is likely to + // result in complete failure of the invocation though when the user is + // able to observe the failure is hard to determine as they may be + // pipelined N invocations deep by the time this runs. + if (!status.ok()) { + iree_hal_fence_fail(signal_fence_.get(), status.release()); + } + + fprintf(stdout, "ASYNC: AFTER SIGNAL\n"); + fflush(stdout); + } + + vm::ref<iree_hal_buffer_view_t> source_view_; + vm::ref<iree_hal_buffer_view_t> target_view_; + vm::ref<iree_hal_fence_t> wait_fence_; + vm::ref<iree_hal_fence_t> signal_fence_; + std::thread thread_; +}; + +// Per-context module state. +// This can contain "globals" and other arbitrary state. +// +// Thread-compatible; the runtime will not issue multiple calls at the same +// time using the same state. If the implementation uses external threads then +// it must synchronize itself. 
+class CustomModuleState final { + public: + explicit CustomModuleState(vm::ref<iree_hal_device_t> device, + iree_allocator_t host_allocator) + : device_(std::move(device)), host_allocator_(host_allocator) {} + ~CustomModuleState() = default; + + StatusOr<vm::ref<iree_hal_buffer_view_t>> CallAsync( + const vm::ref<iree_hal_buffer_view_t> arg_view, + const vm::ref<iree_hal_fence_t> wait_fence, + const vm::ref<iree_hal_fence_t> signal_fence) { + // TODO(benvanik): better fence helpers when timelines are not needed. + vm::ref<iree_hal_semaphore_t> semaphore; + IREE_RETURN_IF_ERROR( + iree_hal_semaphore_create(device_.get(), 0ull, &semaphore)); + vm::ref<iree_hal_fence_t> alloca_fence; + IREE_RETURN_IF_ERROR(iree_hal_fence_create_at( + semaphore.get(), 1ull, host_allocator_, &alloca_fence)); + + // Asynchronously allocate the output memory for the call result. + // This chains the allocation such that the wait_fence must be signaled + // before the memory is allocated and our alloca_fence will be used to + // sequence our work with the allocation: + // + // [wait_fence] -> alloca -> [alloca_fence] -> work -> [signal_fence] + // + // TODO(benvanik): extend to allowing result storage to be passed in (when + // possible to compute sizes). For now all results need to be allocated. 
+ iree_hal_buffer_params_t buffer_params = { + /*.usage=*/IREE_HAL_BUFFER_USAGE_DEFAULT | + IREE_HAL_BUFFER_USAGE_MAPPING, + /*.access=*/IREE_HAL_MEMORY_ACCESS_ALL, + /*.type=*/IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE, + /*.queue_affinity=*/IREE_HAL_QUEUE_AFFINITY_ANY, + /*.min_alignment=*/64, + }; + vm::ref<iree_hal_buffer_t> result_buffer; + IREE_RETURN_IF_ERROR(iree_hal_device_queue_alloca( + device_.get(), IREE_HAL_QUEUE_AFFINITY_ANY, + iree_hal_fence_semaphore_list(wait_fence.get()), + iree_hal_fence_semaphore_list(alloca_fence.get()), + IREE_HAL_ALLOCATOR_POOL_DEFAULT, buffer_params, + iree_hal_buffer_view_byte_length(arg_view.get()), &result_buffer)); + + // Wrap the buffer in a buffer view that provides the metadata for + // runtime verification. + vm::ref<iree_hal_buffer_view_t> result_view; + IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create_like( + result_buffer.get(), arg_view.get(), host_allocator_, &result_view)); + + // Launch the stateful async operation. + // See the notes above - note that this is _not_ a good way of doing this! + // Note that we should be using host_allocator_ here to create these objects + // so that memory is properly tracked as originating from this call. + AsyncOp::Launch(vm::retain_ref(arg_view), vm::retain_ref(result_view), + std::move(alloca_fence), std::move(signal_fence)); + + // Note that the caller needs the buffer view back but is not allowed to + // access its contents until we signal the signal_fence. + return result_view; + } + + private: + // HAL device used for scheduling work and allocations. + vm::ref<iree_hal_device_t> device_; + + // Allocator that the caller requested we use for any allocations we need to + // perform during operation. + iree_allocator_t host_allocator_; +}; + +// Function table mapping imported function names to their implementation. 
+static const vm::NativeFunction<CustomModuleState> kCustomModuleFunctions[] = { + vm::MakeNativeFunction("call.async", &CustomModuleState::CallAsync), +}; + +// The module instance that will be allocated and reused across contexts. +// Any context-specific state must be stored in a state structure such as +// CustomModuleState. +// +// Assumed thread-safe (by construction here, as it's immutable), though if any +// mutable state is stored here it will need to be synchronized by the +// implementation. +class CustomModule final : public vm::NativeModule<CustomModuleState> { + public: + using vm::NativeModule<CustomModuleState>::NativeModule; + + void SetDevice(vm::ref<iree_hal_device_t> device) { + device_ = std::move(device); + } + + // Creates per-context state when the module is added to a new context. + // May be called from any thread. + StatusOr<std::unique_ptr<CustomModuleState>> CreateState( + iree_allocator_t host_allocator) override { + auto state = std::make_unique<CustomModuleState>(vm::retain_ref(device_), + host_allocator); + return state; + } + + private: + vm::ref<iree_hal_device_t> device_; +}; + +} // namespace + +// Note that while we are using C++ bindings internally we still expose the +// module as a C instance. This hides the details of our implementation. +extern "C" iree_status_t iree_custom_module_async_create( + iree_vm_instance_t* instance, iree_hal_device_t* device, + iree_allocator_t host_allocator, iree_vm_module_t** out_module) { + IREE_ASSERT_ARGUMENT(out_module); + *out_module = NULL; + auto module = std::make_unique<CustomModule>( + "custom", /*version=*/0, instance, host_allocator, + iree::span<const vm::NativeFunction<CustomModuleState>>( + kCustomModuleFunctions)); + module->SetDevice(vm::retain_ref(device)); + *out_module = module.release()->interface(); + return iree_ok_status(); +}
diff --git a/samples/custom_module/async/module.h b/samples/custom_module/async/module.h new file mode 100644 index 0000000..f255c1b --- /dev/null +++ b/samples/custom_module/async/module.h
@@ -0,0 +1,34 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_SAMPLES_CUSTOM_MODULE_TENSOR_ASYNC_MODULE_H_ +#define IREE_SAMPLES_CUSTOM_MODULE_TENSOR_ASYNC_MODULE_H_ + +#include <stdint.h> + +#include "iree/base/api.h" +#include "iree/hal/api.h" +#include "iree/vm/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a native custom module that can be reused in multiple contexts. +// The module itself may hold state that can be shared by all instantiated +// copies but it will require the module to provide synchronization; usually +// it's safer to just treat the module as immutable and keep state within the +// instantiated module states instead. +iree_status_t iree_custom_module_async_create(iree_vm_instance_t* instance, + iree_hal_device_t* device, + iree_allocator_t host_allocator, + iree_vm_module_t** out_module); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_SAMPLES_CUSTOM_MODULE_TENSOR_ASYNC_MODULE_H_
diff --git a/samples/custom_module/async/test/CMakeLists.txt b/samples/custom_module/async/test/CMakeLists.txt new file mode 100644 index 0000000..60b8ad6 --- /dev/null +++ b/samples/custom_module/async/test/CMakeLists.txt
@@ -0,0 +1,18 @@ +# Copyright 2022 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_lit_test_suite( + NAME + lit + SRCS + "example.mlir" + TOOLS + FileCheck + iree-compile + iree_samples_custom_module_async_run + LABELS + "hostonly" +)
diff --git a/samples/custom_module/async/test/example.mlir b/samples/custom_module/async/test/example.mlir new file mode 100644 index 0000000..245e287 --- /dev/null +++ b/samples/custom_module/async/test/example.mlir
@@ -0,0 +1,61 @@ +// RUN: iree-compile %s --iree-execution-model=async-external --iree-hal-target-backends=llvm-cpu | custom-module-async-run - example.main | FileCheck %s + +module @example { + //===--------------------------------------------------------------------===// + // Imports + //===--------------------------------------------------------------------===// + // External function declarations for the methods implemented in the custom + // module C++ file. Note that they are prefixed with the `custom.` module + // name. + + // Asynchronous call that takes/returns a tensor. + // IREE will pass in a HAL fence indicating when the input tensor is available + // and a HAL fence that the call can use to indicate when the returned tensor + // is available. It's expected that the call will not block. + // + // Note that `nosideeffects` is critical to ensuring asynchronous execution. + // When omitted IREE will still pass in the fences but wait on the signal + // fence after the call completes before continuing. This may be required when + // returning custom types or synchronizing with external systems. + func.func private @custom.call.async(tensor<?xi32>) -> tensor<?xi32> attributes { + iree.abi.model = "coarse-fences", + nosideeffects + } + + //===--------------------------------------------------------------------===// + // Sample methods + //===--------------------------------------------------------------------===// + // Note that there can be any number of publicly-exported methods; this simple + // sample just has one to keep things simple. + + func.func @main(%arg0: tensor<?xi32>) -> tensor<?xi32> { + // Compiler-generated dispatch work to show dataflow. + %0 = arith.muli %arg0, %arg0 : tensor<?xi32> + + // Custom call to an asynchronous import. + // The runtime will chain together the async work to produce %0 and make the + // call with a wait fence indicating when %0 is ready. 
The call *should* + // return immediately with a newly allocated but not yet populated %1. The + // runtime will then continue to chain the subsequent %2 work pending the + // signal from the call indicating that %1 is ready for use. + // + // Note that allocations are generally blocking unless performed with the + // queue-ordered allocation APIs that chain on to fences. + %1 = call @custom.call.async(%0) : (tensor<?xi32>) -> tensor<?xi32> + + // More generated dispatch work to show dataflow. + %2 = arith.muli %1, %1 : tensor<?xi32> + + return %2 : tensor<?xi32> + } + + // TODO(benvanik): fix wait-before-signal on queue-ordered allocations. + // For now we have to signal to T=1 before invoking the function but that's + // only temporary. + // CHECK: INITIALIZE T=0 + // CHECK: SIGNALED T=1 + // CHECK: VM INVOKE BEGIN example.main + // CHECK: VM INVOKE END + // CHECK: REACHED T=2 + // CHECK: MATCHED! +}
diff --git a/samples/custom_module/sync/CMakeLists.txt b/samples/custom_module/sync/CMakeLists.txt new file mode 100644 index 0000000..9c52a8a --- /dev/null +++ b/samples/custom_module/sync/CMakeLists.txt
@@ -0,0 +1,34 @@ +# Copyright 2022 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Sample requires the llvm-cpu compiler backend and the local-sync runtime +# driver. This could be made to work with other backends. +if(NOT IREE_TARGET_BACKEND_LLVM_CPU OR + NOT IREE_HAL_DRIVER_LOCAL_SYNC) + return() +endif() + +set(_NAME "iree_samples_custom_module_sync_run") +add_executable(${_NAME} "") +target_sources(${_NAME} + PRIVATE + main.c + module.cc + module.h +) + +set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "custom-module-sync-run") + +# TODO(benvanik): make iree_status_annotate_f always available as a function +# instead of defining it empty? otherwise optimized builds of the runtime won't +# export it but external libraries may pull it in. +target_compile_options(${_NAME} PRIVATE ${IREE_DEFAULT_COPTS}) + +target_link_libraries(${_NAME} + iree_runtime_runtime +) + +add_subdirectory(test)
diff --git a/samples/custom_module/sync/README.md b/samples/custom_module/sync/README.md new file mode 100644 index 0000000..96f8616 --- /dev/null +++ b/samples/custom_module/sync/README.md
@@ -0,0 +1,45 @@
+# Synchronous tensor I/O custom module sample
+
+This sample expects that you've already produced a working version of the
+[basic sample](/samples/custom_module/basic/) (including compiler installation
+and CMake setup).
+
+This sample demonstrates adding custom modules callable from compiler-produced
+programs that take and return `tensor` types. By default custom calls are
+treated as blocking operations that synchronize with the underlying device to
+ensure all passed `tensor` buffer views are host coherent and it's assumed that
+any returned `tensor` buffer views are ready for use when the call returns.
+
+This approach is the easiest to integrate and looks similar to classic ML
+frameworks' custom calls. There are many significant performance implications
+of using this approach, though, and synchronous calls should only be used when
+no asynchronous approach is possible. See the
+[async tensor](/samples/custom_module/async/) sample for how to define
+custom calls that work asynchronously.
+
+## Instructions
+
+1. Compile the [example module](./test/example.mlir) to a .vmfb file:
+
+    ```
+    iree-compile --iree-hal-target-backends=llvm-cpu samples/custom_module/sync/test/example.mlir -o=/tmp/example.vmfb
+    ```
+
+2. Build the `iree_samples_custom_module_sync_run` CMake target:
+
+    ```
+    cmake -B ../iree-build/ -DCMAKE_BUILD_TYPE=RelWithDebInfo . \
+        -DCMAKE_C_FLAGS=-DIREE_VM_EXECUTION_TRACING_FORCE_ENABLE=1
+    cmake --build ../iree-build/ --target iree_samples_custom_module_sync_run
+    ```
+    (here we force runtime execution tracing for demonstration purposes)
+
+    [See here](https://iree-org.github.io/iree/building-from-source/getting-started/)
+    for general instructions on building using CMake.
+
+3. Run the example program to call the main function:
+
+    ```
+    ../iree-build/samples/custom_module/sync/custom-module-sync-run \
+        /tmp/example.vmfb example.main
+    ```
diff --git a/samples/custom_module/sync/main.c b/samples/custom_module/sync/main.c new file mode 100644 index 0000000..4095dfe --- /dev/null +++ b/samples/custom_module/sync/main.c
@@ -0,0 +1,144 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include <stdio.h> + +// IREE APIs: +#include "iree/modules/hal/types.h" +#include "iree/runtime/api.h" + +// Custom native module used in the sample. +// Modules may be linked in from native code or other bytecode modules loaded at +// runtime: there's no difference. +#include "module.h" + +// NOTE: CHECKs are dangerous but this is a sample; a real application would +// want to handle errors gracefully. We know in this constrained case that +// these won't fail unless something is catastrophically wrong (out of memory, +// solar flares, etc). +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stderr, + "Usage:\n" + " custom-module-sync-run - <entry.point> # read from stdin\n" + " custom-module-sync-run </path/to/say_hello.vmfb> " + "<entry.point>\n"); + fprintf(stderr, " (See the README for this sample for details)\n "); + return -1; + } + + // Internally IREE does not (in general) use malloc and instead uses the + // provided allocator to allocate and free memory. Applications can integrate + // their own allocator as-needed. + iree_allocator_t host_allocator = iree_allocator_system(); + + // Create and configure the instance shared across all sessions. + iree_runtime_instance_options_t instance_options; + iree_runtime_instance_options_initialize(&instance_options); + iree_runtime_instance_options_use_all_available_drivers(&instance_options); + iree_runtime_instance_t* instance = NULL; + IREE_CHECK_OK(iree_runtime_instance_create(&instance_options, host_allocator, + &instance)); + + // Try to create the device - it should always succeed as it's a CPU device. 
+ iree_hal_device_t* device = NULL; + IREE_CHECK_OK(iree_runtime_instance_try_create_default_device( + instance, iree_make_cstring_view("local-sync"), &device)); + + // Create one session per loaded module to hold the module state. + iree_runtime_session_options_t session_options; + iree_runtime_session_options_initialize(&session_options); + iree_runtime_session_t* session = NULL; + IREE_CHECK_OK(iree_runtime_session_create_with_device( + instance, &session_options, device, + iree_runtime_instance_host_allocator(instance), &session)); + + // Create the custom module that can be reused across contexts. + iree_vm_module_t* custom_module = NULL; + IREE_CHECK_OK(iree_custom_module_sync_create( + iree_runtime_instance_vm_instance(instance), device, host_allocator, + &custom_module)); + IREE_CHECK_OK(iree_runtime_session_append_module(session, custom_module)); + iree_vm_module_release(custom_module); + + // Load the module from stdin or a file on disk. + const char* module_path = argv[1]; + if (strcmp(module_path, "-") == 0) { + IREE_CHECK_OK( + iree_runtime_session_append_bytecode_module_from_stdin(session)); + } else { + IREE_CHECK_OK(iree_runtime_session_append_bytecode_module_from_file( + session, module_path)); + } + + iree_string_view_t entry_point = iree_make_cstring_view(argv[2]); + fprintf(stdout, "INVOKE BEGIN %.*s\n", (int)entry_point.size, + entry_point.data); + fflush(stdout); + + iree_vm_list_t* inputs = NULL; + IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &inputs)); + iree_vm_list_t* outputs = NULL; + IREE_CHECK_OK(iree_vm_list_create(NULL, 1, host_allocator, &outputs)); + + // Pass in the tensor<?xi32> arg: + const int32_t input_data[5] = {1, 2, 3, 4, 5}; + const iree_hal_dim_t shape[1] = {IREE_ARRAYSIZE(input_data)}; + iree_hal_buffer_view_t* input_view = NULL; + IREE_CHECK_OK(iree_hal_buffer_view_allocate_buffer( + iree_runtime_session_device_allocator(session), IREE_ARRAYSIZE(shape), + shape, IREE_HAL_ELEMENT_TYPE_INT_32, + 
IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, + (iree_hal_buffer_params_t){ + .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, + .access = IREE_HAL_MEMORY_ACCESS_READ, + .usage = IREE_HAL_BUFFER_USAGE_DEFAULT, + }, + iree_make_const_byte_span(input_data, sizeof(input_data)), &input_view)); + iree_vm_ref_t input_view_ref = iree_hal_buffer_view_move_ref(input_view); + IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs, &input_view_ref)); + + // Synchronously invoke the requested function. + IREE_CHECK_OK( + iree_runtime_session_call_by_name(session, entry_point, inputs, outputs)); + + // Read back the tensor<?xi32> result: + iree_hal_buffer_view_t* output_view = + iree_vm_list_get_buffer_view_assign(outputs, 0); + int32_t output_data[5] = {0}; + IREE_CHECK_OK( + iree_hal_buffer_map_read(iree_hal_buffer_view_buffer(output_view), 0, + output_data, sizeof(output_data))); + + // Expecting (e^2 * 2)^2: + bool did_match = true; + for (size_t i = 0; i < IREE_ARRAYSIZE(input_data); ++i) { + int32_t t0 = input_data[i]; + int32_t t1 = t0 * t0; + int32_t t2 = t1 * 2; + int32_t t3 = t2 * t2; + if (t3 != output_data[i]) { + fprintf(stdout, "MISMATCH [%zu] expected %d but actual %d\n", i, t3, + output_data[i]); + did_match = false; + break; + } + } + if (did_match) { + fprintf(stdout, "MATCHED!\n"); + } + + iree_vm_list_release(inputs); + iree_vm_list_release(outputs); + + fprintf(stdout, "INVOKE END\n"); + fflush(stdout); + + iree_runtime_session_release(session); + iree_hal_device_release(device); + iree_runtime_instance_release(instance); + return 0; +}
diff --git a/samples/custom_module/sync/module.cc b/samples/custom_module/sync/module.cc new file mode 100644 index 0000000..215088d --- /dev/null +++ b/samples/custom_module/sync/module.cc
@@ -0,0 +1,195 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "module.h" + +#include <cstdio> +#include <thread> + +#include "iree/modules/hal/types.h" +#include "iree/vm/native_module_cc.h" + +// NOTE: this module is written in C++ using the native module wrapper and uses +// template magic to handle marshaling arguments. For a lot of uses this is a +// much friendlier way of exposing modules to the IREE VM and if performance and +// code size are not a concern is a fine route to take. Here we do it for +// brevity but all of the internal IREE modules are implemented in C. + +//===----------------------------------------------------------------------===// +// VM module interface implementation +//===----------------------------------------------------------------------===// + +namespace { + +using namespace iree; + +// Approximation of some external library call that populates a buffer. +// It's assumed that when this is called the |source_buffer| is available to +// read and the |target_buffer| is available to write (no other readers exist). +// This sample assumes that the buffers are mappable so we can do the work here +// but they will not always be. APIs like iree_hal_allocator_import_buffer and +// iree_hal_allocator_export_buffer can be used in some cases to avoid +// potentially expensive operations but real applications that care about +// performance would want to issue async transfer command buffers. +// +// Only use this as a reference for when synchronous behavior is absolutely +// required (old-style blocking file IO/etc). +static Status SyncSimulatedHostOpI32(iree_hal_buffer_t* source_buffer, + iree_hal_buffer_t* target_buffer, + iree_hal_dim_t count) { + Status status = OkStatus(); + + // Map the source and target buffers into host memory. 
Note that not all + // devices allow this but in this sample we assume they do. + iree_hal_buffer_mapping_t source_mapping = {{0}}; + if (status.ok()) { + status = iree_hal_buffer_map_range( + source_buffer, IREE_HAL_MAPPING_MODE_SCOPED, + IREE_HAL_MEMORY_ACCESS_READ, 0, IREE_WHOLE_BUFFER, &source_mapping); + } + iree_hal_buffer_mapping_t target_mapping = {{0}}; + if (status.ok()) { + status = + iree_hal_buffer_map_range(target_buffer, IREE_HAL_MAPPING_MODE_SCOPED, + IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, 0, + IREE_WHOLE_BUFFER, &target_mapping); + } + + // Sad slow host work. Whenever possible it's worth it to move these into the + // program so the IREE compiler can fuse and accelerate these operations. + if (status.ok()) { + const int32_t* source_ptr = + reinterpret_cast<const int32_t*>(source_mapping.contents.data); + int32_t* target_ptr = + reinterpret_cast<int32_t*>(target_mapping.contents.data); + for (iree_host_size_t i = 0; i < count; ++i) { + target_ptr[i] = source_ptr[i] * 2; + } + } + + // We must unmap the buffers before they will be usable. + // Note that it's possible for these to fail in cases where the buffer + // required emulated mapping but on basic host-local devices like CPU assumed + // in this sample that should never happen. + iree_status_ignore(iree_hal_buffer_unmap_range(&source_mapping)); + iree_status_ignore(iree_hal_buffer_unmap_range(&target_mapping)); + + return status; +} + +// Per-context module state. +class CustomModuleState final { + public: + explicit CustomModuleState(vm::ref<iree_hal_device_t> device, + iree_allocator_t host_allocator) + : device_(std::move(device)), host_allocator_(host_allocator) {} + ~CustomModuleState() = default; + + StatusOr<vm::ref<iree_hal_buffer_view_t>> CallSync( + const vm::ref<iree_hal_buffer_view_t> arg_view) { + // We can directly access the buffer here but only for reading. + // In the future it'll be possible to pass in-place buffers. 
+ auto* arg_buffer = iree_hal_buffer_view_buffer(arg_view.get()); + + // Synchronously allocate the memory from the device allocator. We could + // use queue-ordered allocations but that's unsafe to use from arbitrary + // threads and we want to show how to safely do that using the thread-safe + // device allocator. + // + // NOTE: if cloning host memory the initial_data can be passed in to + // efficiently upload the memory to the device. If wrapping host memory then + // iree_hal_allocator_import_buffer can be used to import the memory without + // a copy (if supported). This simple example is showing an in-place style + // external call. + iree_hal_allocator_t* device_allocator = + iree_hal_device_allocator(device_.get()); + iree_hal_buffer_params_t buffer_params = { + /*.usage=*/IREE_HAL_BUFFER_USAGE_DEFAULT | + IREE_HAL_BUFFER_USAGE_MAPPING, + /*.access=*/IREE_HAL_MEMORY_ACCESS_ALL, + /*.type=*/IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE | + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE, + /*.queue_affinity=*/IREE_HAL_QUEUE_AFFINITY_ANY, + /*.min_alignment=*/64, + }; + vm::ref<iree_hal_buffer_t> result_buffer; + IREE_RETURN_IF_ERROR(iree_hal_allocator_allocate_buffer( + device_allocator, buffer_params, + iree_hal_buffer_view_byte_length(arg_view.get()), + iree_const_byte_span_empty(), &result_buffer)); + + // Hacky example accessing the source contents and producing the result + // contents. This emulates what an external library the user is calling that + // expects host void* buffers does. + IREE_RETURN_IF_ERROR(SyncSimulatedHostOpI32( + arg_buffer, result_buffer.get(), + iree_hal_buffer_view_element_count(arg_view.get()))); + + // Wrap the buffer in a buffer view that provides the metadata for + // runtime verification. 
+    vm::ref<iree_hal_buffer_view_t> result_view;
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create_like(
+        result_buffer.get(), arg_view.get(), host_allocator_, &result_view));
+
+    // Note that the caller may immediately use the buffer contents without
+    // waiting as by being synchronous we've indicated that we waited ourselves
+    // (the blocking host work above completed before returning).
+    return result_view;
+  }
+
+ private:
+  // HAL device used for scheduling work and allocations.
+  vm::ref<iree_hal_device_t> device_;
+
+  // Allocator that the caller requested we use for any allocations we need to
+  // perform during operation.
+  iree_allocator_t host_allocator_;
+};
+
+// Function table mapping imported function names to their implementation.
+static const vm::NativeFunction<CustomModuleState> kCustomModuleFunctions[] = {
+    vm::MakeNativeFunction("call.sync", &CustomModuleState::CallSync),
+};
+
+// The module instance that will be allocated and reused across contexts.
+class CustomModule final : public vm::NativeModule<CustomModuleState> {
+ public:
+  using vm::NativeModule<CustomModuleState>::NativeModule;
+
+  void SetDevice(vm::ref<iree_hal_device_t> device) {
+    device_ = std::move(device);
+  }
+
+  // Creates per-context state when the module is added to a new context.
+  // May be called from any thread.
+  StatusOr<std::unique_ptr<CustomModuleState>> CreateState(
+      iree_allocator_t host_allocator) override {
+    auto state = std::make_unique<CustomModuleState>(vm::retain_ref(device_),
+                                                     host_allocator);
+    return state;
+  }
+
+ private:
+  vm::ref<iree_hal_device_t> device_;
+};
+
+}  // namespace
+
+// Note that while we are using C++ bindings internally we still expose the
+// module as a C instance. This hides the details of our implementation.
+extern "C" iree_status_t iree_custom_module_sync_create( + iree_vm_instance_t* instance, iree_hal_device_t* device, + iree_allocator_t host_allocator, iree_vm_module_t** out_module) { + IREE_ASSERT_ARGUMENT(out_module); + *out_module = NULL; + auto module = std::make_unique<CustomModule>( + "custom", /*version=*/0, instance, host_allocator, + iree::span<const vm::NativeFunction<CustomModuleState>>( + kCustomModuleFunctions)); + module->SetDevice(vm::retain_ref(device)); + *out_module = module.release()->interface(); + return iree_ok_status(); +}
diff --git a/samples/custom_module/sync/module.h b/samples/custom_module/sync/module.h new file mode 100644 index 0000000..7e293af --- /dev/null +++ b/samples/custom_module/sync/module.h
@@ -0,0 +1,34 @@ +// Copyright 2022 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_ +#define IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_ + +#include <stdint.h> + +#include "iree/base/api.h" +#include "iree/hal/api.h" +#include "iree/vm/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Creates a native custom module that can be reused in multiple contexts. +// The module itself may hold state that can be shared by all instantiated +// copies but it will require the module to provide synchronization; usually +// it's safer to just treat the module as immutable and keep state within the +// instantiated module states instead. +iree_status_t iree_custom_module_sync_create(iree_vm_instance_t* instance, + iree_hal_device_t* device, + iree_allocator_t host_allocator, + iree_vm_module_t** out_module); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_SAMPLES_CUSTOM_MODULE_TENSOR_SYNC_MODULE_H_
diff --git a/samples/custom_module/sync/test/CMakeLists.txt b/samples/custom_module/sync/test/CMakeLists.txt new file mode 100644 index 0000000..66489be --- /dev/null +++ b/samples/custom_module/sync/test/CMakeLists.txt
@@ -0,0 +1,18 @@ +# Copyright 2022 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_lit_test_suite( + NAME + lit + SRCS + "example.mlir" + TOOLS + FileCheck + iree-compile + iree_samples_custom_module_sync_run + LABELS + "hostonly" +)
diff --git a/samples/custom_module/sync/test/example.mlir b/samples/custom_module/sync/test/example.mlir new file mode 100644 index 0000000..78faa31 --- /dev/null +++ b/samples/custom_module/sync/test/example.mlir
@@ -0,0 +1,43 @@ +// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu | custom-module-sync-run - example.main | FileCheck %s + +module @example { + //===--------------------------------------------------------------------===// + // Imports + //===--------------------------------------------------------------------===// + // External function declarations for the methods implemented in the custom + // module C++ file. Note that they are prefixed with the `custom.` module + // name. + + // Synchronous call that takes/returns a tensor. + // IREE will block and wait until the input tensor is available, make the + // import call, and assume that the returned tensor is immediately available + // for use. + func.func private @custom.call.sync(tensor<?xi32>) -> tensor<?xi32> + + //===--------------------------------------------------------------------===// + // Sample methods + //===--------------------------------------------------------------------===// + // Note that there can be any number of publicly-exported methods; this simple + // sample just has one to keep things simple. + + // CHECK-LABEL: INVOKE BEGIN example.main + func.func @main(%arg0: tensor<?xi32>) -> tensor<?xi32> { + // Compiler-generated dispatch work to show dataflow. + %0 = arith.muli %arg0, %arg0 : tensor<?xi32> + + // Custom call to a synchronous import. + // The runtime will block and wait until %0 is ready before making the call + // and assume it can immediately start using the resulting %1 after the call + // returns. Note that the top-level invocation will block while this call is + // made and if we were running the compiler-generated dispatches above/below + // on a GPU it would fully synchronize the host and device (really bad!). + %1 = call @custom.call.sync(%0) : (tensor<?xi32>) -> tensor<?xi32> + + // More generated dispatch work to show dataflow. + %2 = arith.muli %1, %1 : tensor<?xi32> + + // CHECK: MATCHED! + return %2 : tensor<?xi32> + } + // CHECK-NEXT: INVOKE END +}