Adding multiple_modules sample (and fixing bugs). (#15653)

This demonstrates multiple VM modules calling each other in both
synchronous and asynchronous modes. This is useful both for designing
reusable components and for testing/development/benchmarking,
as it shows how easily pipelines can be constructed for running models
that may have state or non-trivial call sequences. Users can make as
many calls as they want or have their own state or control flow in the
pipelines to build arbitrarily complex sequences of work without needing
to author python/C/yaml for iree-run-trace.
diff --git a/compiler/src/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp b/compiler/src/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
index 977de34..e9bdfc9 100644
--- a/compiler/src/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
+++ b/compiler/src/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
@@ -1302,7 +1302,6 @@
                   ConversionPatternRewriter &rewriter) const override {
     TypeConverter::SignatureConversion signatureConverter(
         funcOp.getFunctionType().getNumInputs());
-    TypeConverter typeConverter;
     for (const auto &arg : llvm::enumerate(funcOp.getArguments())) {
       Type convertedType =
           getTypeConverter()->convertType(arg.value().getType());
@@ -1691,6 +1690,13 @@
                                  emitc::OpaqueAttr::get(ctx, memberName)}),
             /*templateArgs=*/ArrayAttr{},
             /*operands=*/ArrayRef<Value>{value});
+        rewriter.create<emitc::CallOp>(
+            /*location=*/memberPtr.getLoc(),
+            /*type=*/TypeRange{},
+            /*callee=*/StringAttr::get(ctx, "iree_vm_ref_retain_inplace"),
+            /*args=*/ArrayAttr{},
+            /*templateArgs=*/ArrayAttr{},
+            /*operands=*/ArrayRef<Value>{memberPtr.getResult(0)});
         argumentStruct.callArguments.push_back(memberPtr.getResult(0));
       } else {
         Type memberType = input.value();
diff --git a/runtime/src/iree/base/internal/flags.c b/runtime/src/iree/base/internal/flags.c
index d4ce20e..b522364 100644
--- a/runtime/src/iree/base/internal/flags.c
+++ b/runtime/src/iree/base/internal/flags.c
@@ -160,24 +160,33 @@
     // Switching from inline storage to external storage.
     iree_host_size_t new_capacity = 4;
     iree_string_view_t* values = NULL;
-    IREE_RETURN_IF_ERROR(iree_allocator_malloc(
-        iree_flags_leaky_allocator(), sizeof(iree_string_view_t) * new_capacity,
-        (void**)&values));
+    IREE_TRACE_ZONE_BEGIN_NAMED(z_outline,
+                                "iree_flag_string_list_parse_outline");
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z_outline,
+        iree_allocator_malloc(iree_flags_leaky_allocator(),
+                              sizeof(iree_string_view_t) * new_capacity,
+                              (void**)&values));
     values[0] = flag->inline_value;
     flag->capacity = new_capacity;
     flag->values = values;
     flag->values[flag->count++] = value;
+    IREE_TRACE_ZONE_END(z_outline);
   } else if (flag->count < flag->capacity) {  // external storage available
     // Stash in external storage list.
     flag->values[flag->count++] = value;
   } else {  // external storage full
     // Growing external storage list.
+    IREE_TRACE_ZONE_BEGIN_NAMED(z_grow, "iree_flag_string_list_parse_grow");
     iree_host_size_t new_capacity = iree_max(4, flag->capacity * 2);
-    IREE_RETURN_IF_ERROR(iree_allocator_realloc(
-        iree_flags_leaky_allocator(), sizeof(iree_string_view_t) * new_capacity,
-        (void**)&flag->values));
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z_grow,
+        iree_allocator_realloc(iree_flags_leaky_allocator(),
+                               sizeof(iree_string_view_t) * new_capacity,
+                               (void**)&flag->values));
     flag->capacity = new_capacity;
     flag->values[flag->count++] = value;
+    IREE_TRACE_ZONE_END(z_grow);
   }
   return iree_ok_status();
 }
diff --git a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
index e83efd3..265c0b6 100644
--- a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
+++ b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
@@ -501,11 +501,6 @@
       .workgroup_count_z = dispatch_state->workgroup_count_z,
   };
 
-  // Call arguments are retained by the caller.
-  iree_vm_list_retain(binding_list);            // for call
-  iree_vm_buffer_retain(&local_memory_buffer);  // for call
-  iree_vm_buffer_retain(&constants_buffer);     // for call
-
   // VM stack stored on native stack. We really do abuse the stack too much
   // here but it's 8KB and that should be reasonable given that there isn't too
   // much above us in the stack.
diff --git a/runtime/src/iree/modules/check/check_test.cc b/runtime/src/iree/modules/check/check_test.cc
index 7623fb5..0966580 100644
--- a/runtime/src/iree/modules/check/check_test.cc
+++ b/runtime/src/iree/modules/check/check_test.cc
@@ -201,7 +201,7 @@
     IREE_RETURN_IF_ERROR(
         iree_vm_list_push_ref_move(inputs_.get(), &device_ref));
     for (auto& arg : args) {
-      iree_vm_ref_t arg_ref = iree_hal_buffer_view_move_ref(arg.get());
+      iree_vm_ref_t arg_ref = iree_hal_buffer_view_retain_ref(arg.get());
       IREE_RETURN_IF_ERROR(iree_vm_list_push_ref_move(inputs_.get(), &arg_ref));
     }
     return Invoke(function_name);
diff --git a/runtime/src/iree/tooling/run_module.c b/runtime/src/iree/tooling/run_module.c
index 2af3db4..4886b57 100644
--- a/runtime/src/iree/tooling/run_module.c
+++ b/runtime/src/iree/tooling/run_module.c
@@ -232,6 +232,7 @@
         iree_hal_fence_wait(finish_fence, iree_infinite_timeout()),
         "waiting on finish fence");
   }
+  iree_hal_fence_release(finish_fence);
 
   // End profiling after waiting for the invocation to finish.
   if (iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/vm/bytecode/dispatch.c b/runtime/src/iree/vm/bytecode/dispatch.c
index 84fad05..f812e77 100644
--- a/runtime/src/iree/vm/bytecode/dispatch.c
+++ b/runtime/src/iree/vm/bytecode/dispatch.c
@@ -188,7 +188,7 @@
       } break;
       case IREE_VM_CCONV_TYPE_REF: {
         uint16_t dst_reg = ref_reg++;
-        iree_vm_ref_move(
+        iree_vm_ref_retain(
             (iree_vm_ref_t*)p,
             &callee_registers.ref[dst_reg & IREE_REF_REGISTER_MASK]);
         p += sizeof(iree_vm_ref_t);
diff --git a/runtime/src/iree/vm/invocation.c b/runtime/src/iree/vm/invocation.c
index 3b1eaea..2ba5bab 100644
--- a/runtime/src/iree/vm/invocation.c
+++ b/runtime/src/iree/vm/invocation.c
@@ -133,7 +133,7 @@
         // TODO(benvanik): see if we can't remove this retain by instead relying
         // on the caller still owning the list.
         IREE_RETURN_IF_ERROR(
-            iree_vm_list_get_ref_retain(inputs, arg_i, (iree_vm_ref_t*)p));
+            iree_vm_list_get_ref_assign(inputs, arg_i, (iree_vm_ref_t*)p));
         p += sizeof(iree_vm_ref_t);
       } break;
     }
diff --git a/runtime/src/iree/vm/ref.c b/runtime/src/iree/vm/ref.c
index 69c77d4..1edb903 100644
--- a/runtime/src/iree/vm/ref.c
+++ b/runtime/src/iree/vm/ref.c
@@ -19,6 +19,7 @@
     iree_vm_ref_t* ref);
 
 static void iree_vm_ref_trace(const char* msg, iree_vm_ref_t* ref) {
+  if (!ref->ptr) return;
   volatile iree_atomic_ref_count_t* counter = iree_vm_get_ref_counter_ptr(ref);
   iree_string_view_t name = iree_vm_ref_type_name(ref->type);
   fprintf(stderr, "%s %.*s 0x%p %d\n", msg, (int)name.size, name.data, ref->ptr,
@@ -26,6 +27,7 @@
 }
 static void iree_vm_ref_ptr_trace(const char* msg, void* ptr,
                                   iree_vm_ref_type_t type) {
+  if (!ptr) return;
   volatile iree_atomic_ref_count_t* counter =
       iree_vm_get_raw_counter_ptr(ptr, type);
   iree_string_view_t name = iree_vm_ref_type_name(type);
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 2d7691e..03589aa 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -16,6 +16,7 @@
 add_subdirectory(custom_module)
 add_subdirectory(dynamic_shapes)
 add_subdirectory(emitc_modules)
+add_subdirectory(multiple_modules)
 add_subdirectory(py_custom_module)
 add_subdirectory(simple_embedding)
 add_subdirectory(static_library)
diff --git a/samples/multiple_modules/BUILD.bazel b/samples/multiple_modules/BUILD.bazel
new file mode 100644
index 0000000..ae32309
--- /dev/null
+++ b/samples/multiple_modules/BUILD.bazel
@@ -0,0 +1,34 @@
+# Copyright 2023 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
+
+package(
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_lit_test_suite(
+    name = "lit",
+    srcs = [
+        "pipeline_async.mlir",
+        "pipeline_sync.mlir",
+    ],
+    cfg = "//samples:lit.cfg.py",
+    data = [
+        "module_a.mlir",
+        "module_b.mlir",
+    ],
+    tags = [
+        "driver=local-task",
+        "hostonly",
+    ],
+    tools = [
+        "//tools:iree-compile",
+        "//tools:iree-run-module",
+        "@llvm-project//llvm:FileCheck",
+    ],
+)
diff --git a/samples/multiple_modules/CMakeLists.txt b/samples/multiple_modules/CMakeLists.txt
new file mode 100644
index 0000000..68ec83a
--- /dev/null
+++ b/samples/multiple_modules/CMakeLists.txt
@@ -0,0 +1,31 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# samples/multiple_modules/BUILD.bazel                                         #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_lit_test_suite(
+  NAME
+    lit
+  SRCS
+    "pipeline_async.mlir"
+    "pipeline_sync.mlir"
+  TOOLS
+    FileCheck
+    iree-compile
+    iree-run-module
+  DATA
+    module_a.mlir
+    module_b.mlir
+  LABELS
+    "driver=local-task"
+    "hostonly"
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/samples/multiple_modules/README.md b/samples/multiple_modules/README.md
new file mode 100644
index 0000000..36b5cb9
--- /dev/null
+++ b/samples/multiple_modules/README.md
@@ -0,0 +1,84 @@
+# Multiple Modules / Pipelines
+
+IREE programs can be composed of multiple modules and calls can be made between
+them whether the modules are implemented manually (C/C++/python/etc) or
+generated by the compiler (VMFB/emitc). Modules have a user-specified load
+order and can only reference modules loaded prior. Each module can contain
+global state that is scoped to the VM context that loaded them to allow for
+multiple copies of a program to be loaded in a single hosting process.
+
+This sample contains two modules produced by the compiler
+([`module_a.mlir`](./module_a.mlir) and [`module_b.mlir`](./module_b.mlir)) that
+export some basic functions and a pipeline module implemented for both
+synchronous ([`pipeline_sync.mlir`](./pipeline_sync.mlir)) and asynchronous
+execution ([`pipeline_async.mlir`](./pipeline_async.mlir)) that calls into the
+modules. This shows how common/reusable portions of programs can be factored
+into multiple components and complex stateful pipelines can be easily authored
+that use those components.
+
+## Instructions (synchronous)
+
+1. Compile each module to .vmfb files:
+
+    ```
+    iree-compile \
+        --iree-hal-target-backends=vmvx \
+        module_a.mlir -o=module_a_sync.vmfb
+    iree-compile \
+        --iree-hal-target-backends=vmvx \
+        module_b.mlir -o=module_b_sync.vmfb
+    iree-compile \
+        --iree-hal-target-backends=vmvx \
+        pipeline_sync.mlir -o=pipeline_sync.vmfb
+    ```
+
+2. Run the program by passing in the modules in load order (dependencies first):
+
+    ```
+    iree-run-module \
+        --device=local-sync \
+        --module=module_a_sync.vmfb \
+        --module=module_b_sync.vmfb \
+        --module=pipeline_sync.vmfb \
+        --function=run \
+        --input=4096xf32=-2.0
+    ```
+
+## Instructions (asynchronous)
+
+The compilation portion is the same as above, just with the addition of the flag
+telling the compiler to make the modules export functions with async support.
+It's possible to have modules that contain a mix of synchronous and asynchronous
+exports and calls if desired. `iree-run-module` and other IREE tooling
+automatically detects asynchronous calls and handles adding the wait and signal
+fences while applications directly using IREE APIs for execution will need to
+add the fences themselves.
+
+1. Compile each module to .vmfb files with the `async-external` ABI model:
+
+    ```
+    iree-compile \
+        --iree-execution-model=async-external \
+        --iree-hal-target-backends=vmvx \
+        module_a.mlir -o=module_a_async.vmfb
+    iree-compile \
+        --iree-execution-model=async-external \
+        --iree-hal-target-backends=vmvx \
+        module_b.mlir -o=module_b_async.vmfb
+    iree-compile \
+        --iree-execution-model=async-external \
+        --iree-hal-target-backends=vmvx \
+        pipeline_async.mlir -o=pipeline_async.vmfb
+    ```
+
+2. Run the program by passing in the modules in load order (dependencies first):
+
+    ```
+    iree-run-module \
+        --device=local-sync \
+        --module=module_a_async.vmfb \
+        --module=module_b_async.vmfb \
+        --module=pipeline_async.vmfb \
+        --function=run \
+        --input=4096xf32=-2.0
+    ```
diff --git a/samples/multiple_modules/module_a.mlir b/samples/multiple_modules/module_a.mlir
new file mode 100644
index 0000000..48c8c26
--- /dev/null
+++ b/samples/multiple_modules/module_a.mlir
@@ -0,0 +1,6 @@
+builtin.module @module_a {
+  func.func @abs(%input: tensor<4096xf32>) -> tensor<4096xf32> {
+    %result = math.absf %input : tensor<4096xf32>
+    return %result : tensor<4096xf32>
+  }
+}
diff --git a/samples/multiple_modules/module_b.mlir b/samples/multiple_modules/module_b.mlir
new file mode 100644
index 0000000..a23365c
--- /dev/null
+++ b/samples/multiple_modules/module_b.mlir
@@ -0,0 +1,9 @@
+builtin.module @module_b {
+  // Note the iree.abi.output indicating that the %output argument is storage
+  // for function result 0.
+  func.func @mul(%lhs: tensor<4096xf32>, %rhs: tensor<4096xf32>,
+                 %output: tensor<4096xf32> {iree.abi.output = 0 : index}) -> tensor<4096xf32> {
+    %result = arith.mulf %lhs, %rhs : tensor<4096xf32>
+    return %result : tensor<4096xf32>
+  }
+}
diff --git a/samples/multiple_modules/pipeline_async.mlir b/samples/multiple_modules/pipeline_async.mlir
new file mode 100644
index 0000000..46ad83c
--- /dev/null
+++ b/samples/multiple_modules/pipeline_async.mlir
@@ -0,0 +1,66 @@
+// RUN: (iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %p/module_a.mlir -o=%t.module_a.vmfb && \
+// RUN:  iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %p/module_b.mlir -o=%t.module_b.vmfb && \
+// RUN:  iree-compile --iree-execution-model=async-external --iree-hal-target-backends=vmvx %s | \
+// RUN:  iree-run-module --device=local-task \
+// RUN:    --module=%t.module_a.vmfb \
+// RUN:    --module=%t.module_b.vmfb \
+// RUN:    --module=- --function=run \
+// RUN:    --input=4096xf32=-2.0 \
+// RUN:    --expected_output=4096xf32=4.0) | \
+// RUN:  FileCheck %s
+// CHECK: [SUCCESS]
+
+// Functions declared in external modules - note `module_name.func_name`.
+// `abs` will allocate transient memory to pass back the result.
+// `mul` will use the provided output memory to produce the result in-place.
+// Note that though the returned SSA tensor value shares its storage with the
+// `%output` arg the returned value *must* be used to reference the produced
+// version of its contents.
+//
+// In this asynchronous example both functions follow the "coarse-fences" ABI
+// model where the compiler inserts a wait and signal fence pair on each call.
+// To enable this the modules must compiled with the
+// `--iree-execution-model=async-external` and the external declarations must
+// be annotated with the `iree.abi.model` attribute so that the compiler knows
+// the calls have the fences. Note that it's possible to have any combination of
+// asynchronous and synchronous modules and calls in the same program.
+func.func private @module_a.abs(%input: tensor<4096xf32>) -> tensor<4096xf32> attributes {
+  iree.abi.model = "coarse-fences"
+}
+func.func private @module_b.mul(%lhs: tensor<4096xf32>, %rhs: tensor<4096xf32>, %output: tensor<4096xf32> {iree.abi.output = 0 : index}) -> tensor<4096xf32> attributes {
+  iree.abi.model = "coarse-fences"
+}
+
+// Top-level pipeline invoked by the command line tool.
+// Since this is compiled with `--iree-execution-model=async-external` this
+// export will have a wait and signal fence pair that allows the hosting
+// application to execute the entire pipeline asynchronously.
+func.func @run(%input: tensor<4096xf32>) -> tensor<4096xf32> {
+  // Make a simple call that produces a transient result tensor.
+  // Since the call is asynchronous the result is not ready upon return to this
+  // function and it'll be passed with the fence down to the consumer call.
+  %input_abs = call @module_a.abs(%input) : (tensor<4096xf32>) -> tensor<4096xf32>
+
+  // Allocate output storage for the next call. This isn't needed here and
+  // functionally equivalent to `abs` above allocating its own transient memory
+  // but demonstrates how in-place operations can be performed across module
+  // boundaries. The allocation is asynchronous and will be passed with a fence
+  // indicating when it's ready to the consumer call.
+  %result_storage = tensor.empty() : tensor<4096xf32>
+
+  // Make a call that produces its output in the given `%result_storage`.
+// The inputs and result storage are passed with their respective fences, and
+// there is no guarantee they are available at the time the call is made. The
+  // `mul` implementation will chain its work with the fences and only signal
+// its fence when all transitive dependencies and its own execution have
+  // completed.
+  %result = call @module_b.mul(%input_abs, %input_abs, %result_storage) : (tensor<4096xf32>, tensor<4096xf32>, tensor<4096xf32>) -> tensor<4096xf32>
+
+  // Return the final result value - note that we pass back the result of the
+  // `mul` call that aliases the `%result_storage` representing the computed
+  // value and not just `%result_storage`. This is required as the `%result` has
+  // an associated fence indicating when it is available for use and using
+  // `%result_storage` would just wait for the storage to be allocated and not
+  // for the contents to have been populated by `mul`.
+  return %result : tensor<4096xf32>
+}
diff --git a/samples/multiple_modules/pipeline_sync.mlir b/samples/multiple_modules/pipeline_sync.mlir
new file mode 100644
index 0000000..3f9a6e0
--- /dev/null
+++ b/samples/multiple_modules/pipeline_sync.mlir
@@ -0,0 +1,40 @@
+// RUN: (iree-compile --iree-hal-target-backends=vmvx %p/module_a.mlir -o=%t.module_a.vmfb && \
+// RUN:  iree-compile --iree-hal-target-backends=vmvx %p/module_b.mlir -o=%t.module_b.vmfb && \
+// RUN:  iree-compile --iree-hal-target-backends=vmvx %s | \
+// RUN:  iree-run-module --device=local-sync \
+// RUN:    --module=%t.module_a.vmfb \
+// RUN:    --module=%t.module_b.vmfb \
+// RUN:    --module=- --function=run \
+// RUN:    --input=4096xf32=-2.0 \
+// RUN:    --expected_output=4096xf32=4.0) | \
+// RUN:  FileCheck %s
+// CHECK: [SUCCESS]
+
+// Functions declared in external modules - note `module_name.func_name`.
+// `abs` will allocate transient memory to pass back the result.
+// `mul` will use the provided output memory to produce the result in-place.
+// Note that though the returned SSA tensor value shares its storage with the
+// `%output` arg the returned value *must* be used to reference the produced
+// version of its contents.
+func.func private @module_a.abs(%input: tensor<4096xf32>) -> tensor<4096xf32>
+func.func private @module_b.mul(%lhs: tensor<4096xf32>, %rhs: tensor<4096xf32>, %output: tensor<4096xf32> {iree.abi.output = 0 : index}) -> tensor<4096xf32>
+
+// Top-level pipeline invoked by the command line tool.
+func.func @run(%input: tensor<4096xf32>) -> tensor<4096xf32> {
+  // Make a simple call that produces a transient result tensor.
+  %input_abs = call @module_a.abs(%input) : (tensor<4096xf32>) -> tensor<4096xf32>
+
+  // Allocate output storage for the next call. This isn't needed here and
+  // functionally equivalent to `abs` above allocating its own transient memory
+  // but demonstrates how in-place operations can be performed across module
+  // boundaries.
+  %result_storage = tensor.empty() : tensor<4096xf32>
+
+  // Make a call that produces its output in the given `%result_storage`.
+  %result = call @module_b.mul(%input_abs, %input_abs, %result_storage) : (tensor<4096xf32>, tensor<4096xf32>, tensor<4096xf32>) -> tensor<4096xf32>
+
+  // Return the final result value - note that we pass back the result of the
+  // `mul` call that aliases the `%result_storage` representing the computed
+  // value and not just `%result_storage`.
+  return %result : tensor<4096xf32>
+}