| // Copyright 2022 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
#include "module.h"

#include <cstdio>
#include <memory>
#include <thread>

#include "iree/modules/hal/types.h"
#include "iree/vm/native_module_cc.h"
| |
| // NOTE: this module is written in C++ using the native module wrapper and uses |
| // template magic to handle marshaling arguments. For a lot of uses this is a |
| // much friendlier way of exposing modules to the IREE VM and if performance and |
| // code size are not a concern is a fine route to take. Here we do it for |
| // brevity but all of the internal IREE modules are implemented in C. |
| |
| //===----------------------------------------------------------------------===// |
| // VM module interface implementation |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| |
| using namespace iree; |
| |
| // Approximation of some external library call that populates a buffer. |
| // It's assumed that when this is called the |source_buffer| is available to |
| // read and the |target_buffer| is available to write (no other readers exist). |
| // This sample assumes that the buffers are mappable so we can do the work here |
| // but they will not always be. APIs like iree_hal_allocator_import_buffer and |
| // iree_hal_allocator_export_buffer can be used in some cases to avoid |
| // potentially expensive operations but real applications that care about |
| // performance would want to issue async transfer command buffers. |
| // |
| // Only use this as a reference for when synchronous behavior is absolutely |
| // required (old-style blocking file IO/etc). |
| static Status SyncSimulatedHostOpI32(iree_hal_buffer_t* source_buffer, |
| iree_hal_buffer_t* target_buffer, |
| iree_hal_dim_t count) { |
| Status status = OkStatus(); |
| |
| // Map the source and target buffers into host memory. Note that not all |
| // devices allow this but in this sample we assume they do. |
| iree_hal_buffer_mapping_t source_mapping = {{0}}; |
| if (status.ok()) { |
| status = iree_hal_buffer_map_range( |
| source_buffer, IREE_HAL_MAPPING_MODE_SCOPED, |
| IREE_HAL_MEMORY_ACCESS_READ, 0, IREE_WHOLE_BUFFER, &source_mapping); |
| } |
| iree_hal_buffer_mapping_t target_mapping = {{0}}; |
| if (status.ok()) { |
| status = |
| iree_hal_buffer_map_range(target_buffer, IREE_HAL_MAPPING_MODE_SCOPED, |
| IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, 0, |
| IREE_WHOLE_BUFFER, &target_mapping); |
| } |
| |
| // Sad slow host work. Whenever possible it's worth it to move these into the |
| // program so the IREE compiler can fuse and accelerate these operations. |
| if (status.ok()) { |
| const int32_t* source_ptr = |
| reinterpret_cast<const int32_t*>(source_mapping.contents.data); |
| int32_t* target_ptr = |
| reinterpret_cast<int32_t*>(target_mapping.contents.data); |
| for (iree_host_size_t i = 0; i < count; ++i) { |
| target_ptr[i] = source_ptr[i] * 2; |
| } |
| } |
| |
| // We must unmap the buffers before they will be usable. |
| // Note that it's possible for these to fail in cases where the buffer |
| // required emulated mapping but on basic host-local devices like CPU assumed |
| // in this sample that should never happen. |
| iree_status_ignore(iree_hal_buffer_unmap_range(&source_mapping)); |
| iree_status_ignore(iree_hal_buffer_unmap_range(&target_mapping)); |
| |
| return status; |
| } |
| |
| // Represents some kind of stateful async operation. |
| // Here we spin up a thread to wait on the wait_fence, do some expensive work, |
| // and then signal the signal_fence. |
| // |
| // **This is not actually how this should be done** - spinning up a thread for |
| // each operation is extremely wasteful and doing so will contend with the |
| // threads IREE uses for scheduling its compute workloads. This is pretty much |
| // the worst way to run asynchronous work (but at least it's async!). Instead |
| // think of this as an example of calling off to some service/system layer where |
| // the ownership of the work scheduling is not in control of the application |
| // (like networking or RPC). |
| // |
| // Each AsyncOp instance is used for a single operation and deletes itself when |
| // the operation is complete. In order to prevent hangs it's critical that the |
| // signal_fence is signaled or marked as failing. |
| // |
| // TODO(benvanik): demonstrate getting the iree_task_executor_t for direct use. |
| class AsyncOp { |
| public: |
| static void Launch(vm::ref<iree_hal_buffer_view_t> source_view, |
| vm::ref<iree_hal_buffer_view_t> target_view, |
| vm::ref<iree_hal_fence_t> wait_fence, |
| vm::ref<iree_hal_fence_t> signal_fence) { |
| new AsyncOp(std::move(source_view), std::move(target_view), |
| std::move(wait_fence), std::move(signal_fence)); |
| } |
| |
| private: |
| AsyncOp(vm::ref<iree_hal_buffer_view_t> source_view, |
| vm::ref<iree_hal_buffer_view_t> target_view, |
| vm::ref<iree_hal_fence_t> wait_fence, |
| vm::ref<iree_hal_fence_t> signal_fence) |
| : source_view_(std::move(source_view)), |
| target_view_(std::move(target_view)), |
| wait_fence_(std::move(wait_fence)), |
| signal_fence_(std::move(signal_fence)), |
| thread_([this]() { |
| thread_.detach(); |
| ThreadEntry(); |
| delete this; // self cleanup |
| }) {} |
| |
| void ThreadEntry() { |
| IREE_TRACE_SET_THREAD_NAME("std-thread-worker"); |
| IREE_TRACE_SCOPE(); |
| |
| fprintf(stdout, "ASYNC: BEFORE WAIT\n"); |
| fflush(stdout); |
| |
| // Give a pause to simulate doing something expensive. |
| std::this_thread::sleep_for(std::chrono::milliseconds(1000)); |
| |
| // Wait until the tensor is ready for use. A real application could |
| // export the fence to a native wait handle they could use with syscalls |
| // or add the fence to a multi-wait operation. Here we just block the |
| // thread until ready. Due to the nature of ordering it's possible the |
| // fence has already been signaled by the time we get here. |
| Status status = |
| iree_hal_fence_wait(wait_fence_.get(), iree_infinite_timeout()); |
| |
| fprintf(stdout, "ASYNC: AFTER WAIT\n"); |
| fflush(stdout); |
| |
| // Perform the expensive work while the input tensor is known good and |
| // the output is ready to accept it. |
| if (status.ok()) { |
| // Hacky example accessing the source contents and producing the result |
| // contents. This emulates what an external library the user is calling |
| // that expects host void* buffers does. |
| status = SyncSimulatedHostOpI32( |
| iree_hal_buffer_view_buffer(source_view_.get()), |
| iree_hal_buffer_view_buffer(target_view_.get()), |
| iree_hal_buffer_view_element_count(source_view_.get())); |
| } |
| |
| fprintf(stdout, "ASYNC: BEFORE SIGNAL\n"); |
| fflush(stdout); |
| |
| // Try to signal completion so that downstream consumers of the result |
| // can get scheduled. |
| if (status.ok()) { |
| status = iree_hal_fence_signal(signal_fence_.get()); |
| } |
| |
| // If we failed then we propagate the failure status. This is likely to |
| // result in complete failure of the invocation though when the user is |
| // able to observe the failure is hard to determine as they may be |
| // pipelined N invocations deep by the time this runs. |
| if (!status.ok()) { |
| iree_hal_fence_fail(signal_fence_.get(), status.release()); |
| } |
| |
| fprintf(stdout, "ASYNC: AFTER SIGNAL\n"); |
| fflush(stdout); |
| } |
| |
| vm::ref<iree_hal_buffer_view_t> source_view_; |
| vm::ref<iree_hal_buffer_view_t> target_view_; |
| vm::ref<iree_hal_fence_t> wait_fence_; |
| vm::ref<iree_hal_fence_t> signal_fence_; |
| std::thread thread_; |
| }; |
| |
| // Per-context module state. |
| // This can contain "globals" and other arbitrary state. |
| // |
| // Thread-compatible; the runtime will not issue multiple calls at the same |
| // time using the same state. If the implementation uses external threads then |
| // it must synchronize itself. |
| class CustomModuleState final { |
| public: |
| explicit CustomModuleState(vm::ref<iree_hal_device_t> device, |
| iree_allocator_t host_allocator) |
| : device_(std::move(device)), host_allocator_(host_allocator) {} |
| ~CustomModuleState() = default; |
| |
| StatusOr<vm::ref<iree_hal_buffer_view_t>> CallAsync( |
| const vm::ref<iree_hal_buffer_view_t> arg_view, |
| const vm::ref<iree_hal_fence_t> wait_fence, |
| const vm::ref<iree_hal_fence_t> signal_fence) { |
| // TODO(benvanik): better fence helpers when timelines are not needed. |
| vm::ref<iree_hal_semaphore_t> semaphore; |
| IREE_RETURN_IF_ERROR(iree_hal_semaphore_create( |
| device_.get(), 0ull, IREE_HAL_SEMAPHORE_FLAG_NONE, &semaphore)); |
| vm::ref<iree_hal_fence_t> alloca_fence; |
| IREE_RETURN_IF_ERROR(iree_hal_fence_create_at( |
| semaphore.get(), 1ull, host_allocator_, &alloca_fence)); |
| |
| // Asynchronously allocate the output memory for the call result. |
| // This chains the allocation such that the wait_fence must be signaled |
| // before the memory is allocated and our alloca_fence will be used to |
| // sequence our work with the allocation: |
| // |
| // [wait_fence] -> alloca -> [alloca_fence] -> work -> [signal_fence] |
| // |
| // TODO(benvanik): extend to allowing result storage to be passed in (when |
| // possible to compute sizes). For now all results need to be allocated. |
| iree_hal_buffer_params_t buffer_params = { |
| /*.usage=*/IREE_HAL_BUFFER_USAGE_DEFAULT | |
| IREE_HAL_BUFFER_USAGE_MAPPING, |
| /*.access=*/IREE_HAL_MEMORY_ACCESS_ALL, |
| /*.type=*/IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE | |
| IREE_HAL_MEMORY_TYPE_HOST_VISIBLE, |
| /*.queue_affinity=*/IREE_HAL_QUEUE_AFFINITY_ANY, |
| /*.min_alignment=*/64, |
| }; |
| vm::ref<iree_hal_buffer_t> result_buffer; |
| IREE_RETURN_IF_ERROR(iree_hal_device_queue_alloca( |
| device_.get(), IREE_HAL_QUEUE_AFFINITY_ANY, |
| iree_hal_fence_semaphore_list(wait_fence.get()), |
| iree_hal_fence_semaphore_list(alloca_fence.get()), |
| IREE_HAL_ALLOCATOR_POOL_DEFAULT, buffer_params, |
| iree_hal_buffer_view_byte_length(arg_view.get()), &result_buffer)); |
| |
| // Wrap the buffer in a buffer view that provides the metadata for |
| // runtime verification. |
| vm::ref<iree_hal_buffer_view_t> result_view; |
| IREE_RETURN_IF_ERROR(iree_hal_buffer_view_create_like( |
| result_buffer.get(), arg_view.get(), host_allocator_, &result_view)); |
| |
| // Launch the stateful async operation. |
| // See the notes above - note that this is _not_ a good way of doing this! |
| // Note that we should be using host_allocator_ here to create these objects |
| // so that memory is properly tracked as originating from this call. |
| AsyncOp::Launch(vm::retain_ref(arg_view), vm::retain_ref(result_view), |
| std::move(alloca_fence), std::move(signal_fence)); |
| |
| // Note that the caller needs the buffer view back but is not allowed to |
| // access its contents until we signal the signal_fence. |
| return result_view; |
| } |
| |
| private: |
| // HAL device used for scheduling work and allocations. |
| vm::ref<iree_hal_device_t> device_; |
| |
| // Allocator that the caller requested we use for any allocations we need to |
| // perform during operation. |
| iree_allocator_t host_allocator_; |
| }; |
| |
| // Function table mapping imported function names to their implementation. |
| static const vm::NativeFunction<CustomModuleState> kCustomModuleFunctions[] = { |
| vm::MakeNativeFunction("call.async", &CustomModuleState::CallAsync), |
| }; |
| |
| // The module instance that will be allocated and reused across contexts. |
| // Any context-specific state must be stored in a state structure such as |
| // CustomModuleState. |
| // |
| // Assumed thread-safe (by construction here, as it's immutable), though if any |
| // mutable state is stored here it will need to be synchronized by the |
| // implementation. |
| class CustomModule final : public vm::NativeModule<CustomModuleState> { |
| public: |
| using vm::NativeModule<CustomModuleState>::NativeModule; |
| |
| void SetDevice(vm::ref<iree_hal_device_t> device) { |
| device_ = std::move(device); |
| } |
| |
| // Creates per-context state when the module is added to a new context. |
| // May be called from any thread. |
| StatusOr<std::unique_ptr<CustomModuleState>> CreateState( |
| iree_allocator_t host_allocator) override { |
| auto state = std::make_unique<CustomModuleState>(vm::retain_ref(device_), |
| host_allocator); |
| return state; |
| } |
| |
| private: |
| vm::ref<iree_hal_device_t> device_; |
| }; |
| |
| } // namespace |
| |
| // Note that while we are using C++ bindings internally we still expose the |
| // module as a C instance. This hides the details of our implementation. |
| extern "C" iree_status_t iree_custom_module_async_create( |
| iree_vm_instance_t* instance, iree_hal_device_t* device, |
| iree_allocator_t host_allocator, iree_vm_module_t** out_module) { |
| IREE_ASSERT_ARGUMENT(out_module); |
| *out_module = NULL; |
| |
| // NOTE: this isn't using the allocator here and that's bad as it leaves |
| // untracked allocations and pulls in the system allocator that may differ |
| // from the one requested by the user. |
| // TODO(benvanik): std::allocator wrapper around iree_allocator_t so this can |
| // use that instead. |
| auto module = std::make_unique<CustomModule>( |
| "custom", /*version=*/0, instance, host_allocator, |
| iree::span<const vm::NativeFunction<CustomModuleState>>( |
| kCustomModuleFunctions)); |
| module->SetDevice(vm::retain_ref(device)); |
| |
| *out_module = module.release()->interface(); |
| return iree_ok_status(); |
| } |