Synchronize submodules with LLVM at llvm/llvm-project@d32787230d52

Updates LLVM dependencies to match
[d32787230d52](https://github.com/llvm/llvm-project/commit/d32787230d52).
- TensorFlow to
  [239a95692b15](https://github.com/tensorflow/tensorflow/commit/239a95692b15)
- MLIR-HLO to
  [5087ffb61af0](https://github.com/tensorflow/mlir-hlo/commit/5087ffb61af0)

`./scripts/git/update_to_llvm_syncpoint.py`

PiperOrigin-RevId: 412106166
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 23b163f..e02085b 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -4,6 +4,8 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+# Keep this in sync with scripts/lint.sh
+
 name: Lint
 
 on: [pull_request]
diff --git a/build_tools/benchmarks/run_benchmarks_on_android.py b/build_tools/benchmarks/run_benchmarks_on_android.py
index fde034e..5bdeb98 100755
--- a/build_tools/benchmarks/run_benchmarks_on_android.py
+++ b/build_tools/benchmarks/run_benchmarks_on_android.py
@@ -108,7 +108,7 @@
 
 
 def adb_push_to_tmp_dir(content: str,
-                        relative_dir: str,
+                        relative_dir: str = "",
                         verbose: bool = False) -> str:
   """Pushes content onto the Android device.
 
@@ -127,11 +127,13 @@
   return android_path
 
 
-def adb_execute_in_dir(cmd_args: Sequence[str],
-                       relative_dir: str,
-                       verbose: bool = False) -> str:
-  """Executes command with adb shell in a directory, waits for completion,
-  and returns the output.
+def adb_execute_and_get_output(cmd_args: Sequence[str],
+                               relative_dir: str = "",
+                               verbose: bool = False) -> str:
+  """Executes command with adb shell.
+
+  Switches to `relative_dir` relative to the android tmp directory before
+  executing. Waits for completion and returns the command stdout.
 
   Args:
     cmd_args: a list containing the command to execute and its parameters
@@ -142,16 +144,40 @@
     A string for the command output.
   """
   cmd = ["adb", "shell"]
-  cmd.extend(["cd", f"{ANDROID_TMP_DIR}/{relative_dir}"])
+  cmd.extend(["cd", os.path.join(ANDROID_TMP_DIR, relative_dir)])
   cmd.append("&&")
   cmd.extend(cmd_args)
 
   return execute_cmd_and_get_output(cmd, verbose=verbose)
 
 
-def adb_start_in_dir(cmd_args: Sequence[str],
-                     relative_dir: str,
-                     verbose: bool = False) -> subprocess.Popen:
+def adb_execute(cmd_args: Sequence[str],
+                relative_dir: str = "",
+                verbose: bool = False) -> subprocess.CompletedProcess:
+  """Executes command with adb shell.
+
+  Switches to `relative_dir` relative to the android tmp directory before
+  executing. Waits for completion. Output is streamed to the terminal.
+
+  Args:
+    cmd_args: a list containing the command to execute and its parameters
+    relative_dir: the directory to execute the command in; relative to
+      ANDROID_TMP_DIR.
+
+  Returns:
+    The completed process.
+  """
+  cmd = ["adb", "shell"]
+  cmd.extend(["cd", os.path.join(ANDROID_TMP_DIR, relative_dir)])
+  cmd.append("&&")
+  cmd.extend(cmd_args)
+
+  return execute_cmd(cmd, verbose=verbose)
+
+
+def adb_start_cmd(cmd_args: Sequence[str],
+                  relative_dir: str,
+                  verbose: bool = False) -> subprocess.Popen:
   """Executes command with adb shell in a directory and returns the handle
   without waiting for completion.
 
@@ -372,9 +398,9 @@
             "--benchmark_out_format=json",
             f"--benchmark_out='{benchmark_results_basename}'",
         ]
-        result_json = adb_execute_in_dir(cmd,
-                                         android_relative_dir,
-                                         verbose=verbose)
+        result_json = adb_execute_and_get_output(cmd,
+                                                 android_relative_dir,
+                                                 verbose=verbose)
 
         # Pull the result file back onto the host and set the filename for later
         # return.
@@ -400,9 +426,7 @@
 
         # Just launch the traced benchmark tool with TRACY_NO_EXIT=1 without
         # waiting for the adb command to complete as that won't happen.
-        process = adb_start_in_dir(run_cmd,
-                                   android_relative_dir,
-                                   verbose=verbose)
+        process = adb_start_cmd(run_cmd, android_relative_dir, verbose=verbose)
         # But we do need to wait for its start; otherwise will see connection
         # failure when opening the catpure tool. Here we cannot just sleep a
         # certain amount of seconds---Pixel 4 seems to have an issue that will
@@ -523,6 +547,14 @@
   return (benchmark_files, captures, errors)
 
 
+def set_frequency_scaling_governor(governor: str):
+  git_root = execute_cmd_and_get_output(["git", "rev-parse", "--show-toplevel"])
+  cpu_script = os.path.join(
+      git_root, "build_tools/benchmarks/set_android_scaling_governor.sh")
+  adb_push_to_tmp_dir(cpu_script)
+  adb_execute(["su", "root", "./set_android_scaling_governor.sh", governor])
+
+
 def parse_arguments():
   """Parses command-line options."""
 
@@ -581,6 +613,11 @@
                       action="store_true",
                       help="Print internal information during execution")
   parser.add_argument(
+      "--pin-cpu-freq",
+      "--pin_cpu_freq",
+      action="store_true",
+      help="Pin CPU frequency for all cores to the maximum. Requires root")
+  parser.add_argument(
       "--keep_going",
       "--keep-going",
       action="store_true",
@@ -620,6 +657,10 @@
     raise ValueError(f"Unrecognized GPU name: '{device_info.gpu_name}'; "
                      "need to update the map")
 
+  if args.pin_cpu_freq:
+    set_frequency_scaling_governor("performance")
+    atexit.register(set_frequency_scaling_governor, "schedutil")
+
   previous_benchmarks = None
   previous_captures = None
 
diff --git a/build_tools/benchmarks/set_android_scaling_governor.sh b/build_tools/benchmarks/set_android_scaling_governor.sh
new file mode 100755
index 0000000..4f73f87
--- /dev/null
+++ b/build_tools/benchmarks/set_android_scaling_governor.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Runs on an android device itself to set the frequency scaling governor for all
+# CPUs (default performance).
+
+################################### WARNING ####################################
+# This will overheat the phone if it's not on a cooling plate, resulting in    #
+# thermal throttling. To prevent anything catching on fire, the actual CPU     #
+# frequencies will be throttled to below the maximum, skewing your results.    #
+################################################################################
+
+set -euo pipefail
+
+GOVERNOR="${1:-performance}"
+
+echo "CPU info (before changing governor):"
+echo 'cpu\tgovernor\tcur\tmin\tmax'
+echo "------------------------------------------------"
+for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \
+    echo "cpu${i}" | paste \
+      - \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/scaling_governor" \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_cur_freq" \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_min_freq" \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_max_freq"; \
+done
+
+echo "Setting CPU frequency governor to ${GOVERNOR}"
+
+for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \
+  echo "${GOVERNOR}" > \
+    "/sys/devices/system/cpu/cpu${i?}/cpufreq/scaling_governor"; \
+done
+
+echo "CPU info (after changing governor):"
+echo 'cpu\tgovernor\tcur\tmin\tmax'
+for i in `cat /sys/devices/system/cpu/present | tr '-' ' ' | xargs seq`; do \
+    echo "cpu${i}" | paste \
+      - \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/scaling_governor" \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_cur_freq" \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_min_freq" \
+      "/sys/devices/system/cpu/cpu${i}/cpufreq/cpuinfo_max_freq"; \
+done
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
index cb0315b..fc2bc27 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
@@ -32,7 +32,7 @@
       - "tar -xzvf benchmark-suites-${BUILDKITE_BUILD_NUMBER}.tgz"
       - "tar -xzvf iree-android-tools-${BUILDKITE_BUILD_NUMBER}.tgz"
       - "tar -xzvf tracy-capture-058e8901.tgz"
-      - "python3 build_tools/benchmarks/run_benchmarks_on_android.py --normal_benchmark_tool=build-android/iree/tools/iree-benchmark-module --traced_benchmark_tool=build-android-trace/iree/tools/iree-benchmark-module --trace_capture_tool=tracy-capture -o benchmark-results-pixel-4-${BUILDKITE_BUILD_NUMBER}.json --capture_tarball=trace-captures-pixel-4-${BUILDKITE_BUILD_NUMBER}.tgz --verbose build-host/"
+      - "python3 build_tools/benchmarks/run_benchmarks_on_android.py --pin-cpu-freq --normal_benchmark_tool=build-android/iree/tools/iree-benchmark-module --traced_benchmark_tool=build-android-trace/iree/tools/iree-benchmark-module --trace_capture_tool=tracy-capture -o benchmark-results-pixel-4-${BUILDKITE_BUILD_NUMBER}.json --capture_tarball=trace-captures-pixel-4-${BUILDKITE_BUILD_NUMBER}.tgz --verbose build-host/"
     if: "build.pull_request.id == null || (build.pull_request.labels includes 'buildkite:benchmark')"
     agents:
       - "android-soc=snapdragon-855"
diff --git a/build_tools/cmake/iree_copts.cmake b/build_tools/cmake/iree_copts.cmake
index 9e492b7..5826210 100644
--- a/build_tools/cmake/iree_copts.cmake
+++ b/build_tools/cmake/iree_copts.cmake
@@ -125,6 +125,9 @@
     # but it's better to not get spurious failures during LTCG.
     # https://docs.microsoft.com/en-us/cpp/build/reference/bigobj-increase-number-of-sections-in-dot-obj-file
     "/bigobj"
+
+    # Use the modern C preprocessor to more closely match standards/clang/gcc behavior.
+    "/Zc:preprocessor"
 )
 
 # Compiler diagnostics.
diff --git a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
index d83c00f..d38ecc6 100644
--- a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/ConvertVMToEmitC.cpp
@@ -182,6 +182,7 @@
 }
 
 LogicalResult convertFuncOp(IREE::VM::FuncOp funcOp,
+                            IREE::VM::EmitCTypeConverter &typeConverter,
                             VMAnalysisCache &vmAnalysisCache,
                             SmallVector<BlockArgument, 4> &blockArgsToRemove) {
   auto ctx = funcOp.getContext();
@@ -281,6 +282,8 @@
 
   builder.setInsertionPointToStart(&entryBlock);
 
+  ptr->second.numRefArguments = numRefArgs;
+
   for (int i = 0; i < numLocalRefs; i++) {
     auto refOp = builder.create<emitc::ConstantOp>(
         /*location=*/loc,
@@ -484,8 +487,9 @@
   return elementTypePtrOp;
 }
 
-void releaseLocalRefs(OpBuilder &builder, Location location,
-                      mlir::FuncOp funcOp, VMAnalysisCache &vmAnalysisCache) {
+/// Releases refs which are local to the function as well as ref arguments.
+void releaseRefs(OpBuilder &builder, Location location, mlir::FuncOp funcOp,
+                 VMAnalysisCache &vmAnalysisCache) {
   auto ctx = builder.getContext();
 
   auto ptr = vmAnalysisCache.find(funcOp.getOperation());
@@ -507,6 +511,24 @@
         /*templateArgs=*/ArrayAttr{},
         /*operands=*/ArrayRef<Value>{localRef});
   }
+
+  // We only release the original arguments not the results which were appended
+  // as further operands.
+  size_t refArgumentsReleased = 0;
+  for (auto arg : funcOp.getArguments()) {
+    if (arg.getType() == emitc::OpaqueType::get(ctx, "iree_vm_ref_t*")) {
+      if (ptr->second.numRefArguments <= refArgumentsReleased++) {
+        break;
+      }
+      builder.create<emitc::CallOp>(
+          /*location=*/location,
+          /*type=*/TypeRange{},
+          /*callee=*/StringAttr::get(ctx, "iree_vm_ref_release"),
+          /*args=*/ArrayAttr{},
+          /*templateArgs=*/ArrayAttr{},
+          /*operands=*/ArrayRef<Value>{arg});
+    }
+  }
 }
 
 /// Generate an emitc.call op with one result and split the current block into a
@@ -575,7 +597,7 @@
     Block *block = builder.getBlock();
     mlir::FuncOp funcOp = cast<mlir::FuncOp>(block->getParentOp());
 
-    releaseLocalRefs(builder, location, funcOp, vmAnalysisCache);
+    releaseRefs(builder, location, funcOp, vmAnalysisCache);
 
     builder.create<mlir::ReturnOp>(location, callOp.getResult(0));
   };
@@ -597,7 +619,7 @@
     Block *block = builder.getBlock();
     mlir::FuncOp funcOp = cast<mlir::FuncOp>(block->getParentOp());
 
-    releaseLocalRefs(builder, location, funcOp, vmAnalysisCache);
+    releaseRefs(builder, location, funcOp, vmAnalysisCache);
 
     auto statusOp = builder.create<emitc::CallOp>(
         /*location=*/location,
@@ -678,7 +700,7 @@
     Block *block = builder.getBlock();
     mlir::FuncOp funcOp = cast<mlir::FuncOp>(block->getParentOp());
 
-    releaseLocalRefs(builder, location, funcOp, vmAnalysisCache);
+    releaseRefs(builder, location, funcOp, vmAnalysisCache);
 
     builder.create<mlir::ReturnOp>(location, callOp.getResult(0));
   };
@@ -913,7 +935,7 @@
                           buffer.getResult(0)});
     }
 
-    // Zero out refs
+    // Zero out refs from state struct.
     auto ordinal_counts = moduleOp.ordinal_counts();
 
     if (!ordinal_counts.hasValue()) {
@@ -1025,6 +1047,46 @@
         /*templateArgs=*/ArrayAttr{},
         /*operands=*/ArrayRef<Value>{funcOp.getArgument(1)});
 
+    // Release refs from state struct.
+    auto ordinal_counts = moduleOp.ordinal_counts();
+
+    if (!ordinal_counts.hasValue()) {
+      return moduleOp.emitError()
+             << "ordinal_counts attribute not found. The OrdinalAllocationPass "
+                "must be run before.";
+    }
+    const int numGlobalRefs = ordinal_counts.getValue().global_refs();
+
+    auto refs = builder.create<emitc::CallOp>(
+        /*location=*/loc,
+        /*type=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t*"),
+        /*callee=*/StringAttr::get(ctx, "EMITC_STRUCT_PTR_MEMBER"),
+        /*args=*/
+        ArrayAttr::get(ctx, {builder.getIndexAttr(0),
+                             emitc::OpaqueAttr::get(ctx, "refs")}),
+        /*templateArgs=*/ArrayAttr{},
+        /*operands=*/ArrayRef<Value>{stateOp.getResult(0)});
+
+    for (int i = 0; i < numGlobalRefs; i++) {
+      auto refPtrOp = builder.create<emitc::CallOp>(
+          /*location=*/loc,
+          /*type=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t*"),
+          /*callee=*/StringAttr::get(ctx, "EMITC_ARRAY_ELEMENT_ADDRESS"),
+          /*args=*/
+          ArrayAttr::get(
+              ctx, {builder.getIndexAttr(0), builder.getUI32IntegerAttr(i)}),
+          /*templateArgs=*/ArrayAttr{},
+          /*operands=*/ArrayRef<Value>{refs.getResult(0)});
+
+      builder.create<emitc::CallOp>(
+          /*location=*/loc,
+          /*type=*/TypeRange{},
+          /*callee=*/StringAttr::get(ctx, "iree_vm_ref_release"),
+          /*args=*/ArrayAttr{},
+          /*templateArgs=*/ArrayAttr{},
+          /*operands=*/ArrayRef<Value>{refPtrOp.getResult(0)});
+    }
+
     auto allocatorOp = builder.create<emitc::CallOp>(
         /*location=*/loc,
         /*type=*/emitc::OpaqueType::get(ctx, "iree_allocator_t"),
@@ -1416,18 +1478,20 @@
   VMAnalysisCache &vmAnalysisCache;
 };
 
-class CallOpConversion : public OpConversionPattern<IREE::VM::CallOp> {
+template <typename CallOpTy>
+class CallOpConversion : public OpConversionPattern<CallOpTy> {
  public:
-  using OpConversionPattern<IREE::VM::CallOp>::OpConversionPattern;
+  using Adaptor = typename CallOpTy::Adaptor;
+  using OpConversionPattern<CallOpTy>::OpConversionPattern;
 
   CallOpConversion(TypeConverter &typeConverter, MLIRContext *context,
                    VMAnalysisCache &vmAnalysisCache)
-      : OpConversionPattern<IREE::VM::CallOp>(typeConverter, context),
+      : OpConversionPattern<CallOpTy>(typeConverter, context),
         vmAnalysisCache(vmAnalysisCache) {}
 
  private:
   LogicalResult matchAndRewrite(
-      IREE::VM::CallOp op, OpAdaptor adaptor,
+      CallOpTy op, Adaptor adaptor,
       ConversionPatternRewriter &rewriter) const override {
     mlir::FuncOp funcOp =
         lookupSymbolRef<mlir::FuncOp>(op.getOperation(), "callee");
@@ -1442,19 +1506,21 @@
 
     const bool isImported = importOp != nullptr;
 
-    return isImported ? rewriteImportedCall(op, adaptor, rewriter, importOp)
-                      : rewriteInternalCall(op, adaptor, rewriter, funcOp);
+    return isImported ? rewriteImportedCall(op.getOperation(), adaptor,
+                                            rewriter, importOp)
+                      : rewriteInternalCall(op.getOperation(), adaptor,
+                                            rewriter, funcOp);
   }
 
-  LogicalResult rewriteInternalCall(IREE::VM::CallOp op, OpAdaptor adaptor,
+  LogicalResult rewriteInternalCall(Operation *op, Adaptor adaptor,
                                     ConversionPatternRewriter &rewriter,
                                     mlir::FuncOp funcOp) const {
-    auto loc = op.getLoc();
+    auto loc = op->getLoc();
 
     SmallVector<Value, 4> updatedOperands;
     SmallVector<Value, 4> resultOperands;
 
-    auto parentFuncOp = op.getOperation()->getParentOfType<mlir::FuncOp>();
+    auto parentFuncOp = op->getParentOfType<mlir::FuncOp>();
 
     BlockArgument stackArg = parentFuncOp.getArgument(0);
     BlockArgument moduleArg = parentFuncOp.getArgument(1);
@@ -1462,7 +1528,7 @@
 
     updatedOperands = {stackArg, moduleArg, moduleStateArg};
 
-    if (failed(updateOperands(op, op.getOperands(), rewriter, updatedOperands,
+    if (failed(updateOperands(op, op->getOperands(), rewriter, updatedOperands,
                               resultOperands))) {
       return failure();
     };
@@ -1480,11 +1546,11 @@
     return success();
   }
 
-  LogicalResult rewriteImportedCall(IREE::VM::CallOp op, OpAdaptor adaptor,
+  LogicalResult rewriteImportedCall(Operation *op, Adaptor adaptor,
                                     ConversionPatternRewriter &rewriter,
                                     IREE::VM::ImportOp importOp) const {
-    auto ctx = op.getContext();
-    auto loc = op.getLoc();
+    auto ctx = op->getContext();
+    auto loc = op->getLoc();
 
     SmallVector<Value, 4> updatedOperands;
     SmallVector<Value, 4> resultOperands;
@@ -1495,11 +1561,11 @@
     Optional<std::string> funcName = buildFunctionName(moduleOp, importOp);
 
     if (!funcName.hasValue())
-      return op.emitError() << "Couldn't build name to imported function";
+      return op->emitError() << "Couldn't build name to imported function";
 
     int importOrdinal = importOp.ordinal().getValue().getZExtValue();
 
-    auto funcOp = op.getOperation()->getParentOfType<mlir::FuncOp>();
+    auto funcOp = op->getParentOfType<mlir::FuncOp>();
     BlockArgument stackArg = funcOp.getArgument(0);
     BlockArgument stateArg = funcOp.getArgument(2);
 
@@ -1525,7 +1591,38 @@
 
     updatedOperands = {stackArg, import.getResult(0)};
 
-    if (failed(updateOperands(op, op.getOperands(), rewriter, updatedOperands,
+    auto isVariadic = [](APInt segmentSize) {
+      return segmentSize.getSExtValue() != -1;
+    };
+
+    if (auto variadicOp = dyn_cast<IREE::VM::CallVariadicOp>(op)) {
+      DenseIntElementsAttr segmentSizes = variadicOp.segment_sizes();
+      size_t numSegments = segmentSizes.size();
+      size_t numVariadicSegments = llvm::count_if(segmentSizes, isVariadic);
+
+      if (numVariadicSegments != 1) {
+        return op->emitError() << "only exactly one variadic segment supported";
+      }
+
+      auto lastSegmentSize = *(segmentSizes.begin() + (numSegments - 1));
+
+      if (!isVariadic(lastSegmentSize)) {
+        return op->emitError() << "expected the last segment to be variadic";
+      }
+
+      size_t numSpans = lastSegmentSize.getSExtValue();
+
+      // TODO(simon-camp): The generated code would be cleaner if we used the
+      // args attribute of the call op to specify the constant.
+      auto numSpansOp = rewriter.create<emitc::ConstantOp>(
+          /*location=*/loc,
+          /*resultType=*/rewriter.getIndexType(),
+          /*value=*/rewriter.getIndexAttr(numSpans));
+
+      updatedOperands.push_back(numSpansOp.getResult());
+    }
+
+    if (failed(updateOperands(op, op->getOperands(), rewriter, updatedOperands,
                               resultOperands))) {
       return failure();
     }
@@ -1547,17 +1644,17 @@
     return success();
   }
 
-  LogicalResult updateOperands(IREE::VM::CallOp op, OperandRange operands,
+  LogicalResult updateOperands(Operation *op, OperandRange operands,
                                ConversionPatternRewriter &rewriter,
                                SmallVector<Value, 4> &updatedOperands,
                                SmallVector<Value, 4> &resultOperands) const {
-    auto ctx = op.getContext();
-    auto loc = op.getLoc();
+    auto ctx = op->getContext();
+    auto loc = op->getLoc();
 
-    auto funcOp = op.getOperation()->getParentOfType<mlir::FuncOp>();
+    auto funcOp = op->getParentOfType<mlir::FuncOp>();
     auto ptr = vmAnalysisCache.find(funcOp.getOperation());
     if (ptr == vmAnalysisCache.end()) {
-      return op.emitError() << "parent func op not found in cache.";
+      return op->emitError() << "parent func op not found in cache.";
     }
 
     for (Value operand : operands) {
@@ -1565,6 +1662,12 @@
              emitc::OpaqueType::get(ctx, "iree_vm_ref_t*"));
 
       if (operand.getType().isa<IREE::VM::RefType>()) {
+        Optional<Value> operandRef = findRef(funcOp, vmAnalysisCache, operand);
+
+        if (!operandRef.hasValue()) {
+          return op->emitError() << "local ref not found";
+        }
+
         auto refOp = rewriter.create<emitc::ConstantOp>(
             /*location=*/loc,
             /*resultType=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t"),
@@ -1572,7 +1675,7 @@
 
         auto refPtrOp = rewriter.create<emitc::ApplyOp>(
             /*location=*/loc,
-            /*type=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t*"),
+            /*result=*/emitc::OpaqueType::get(ctx, "iree_vm_ref_t*"),
             /*applicableOperator=*/StringAttr::get(ctx, "&"),
             /*operand=*/refOp.getResult());
 
@@ -1581,22 +1684,11 @@
           return failure();
         }
 
-        bool move = ptr->second.isLastValueUse(operand, op.getOperation());
-
-        Optional<Value> operandRef = findRef(funcOp, vmAnalysisCache, operand);
-
-        if (!operandRef.hasValue()) {
-          return op.emitError() << "local ref not found";
-        }
-
         rewriter.create<emitc::CallOp>(
             /*location=*/loc,
             /*type=*/TypeRange{},
-            /*callee=*/StringAttr::get(ctx, "iree_vm_ref_retain_or_move"),
-            /*args=*/
-            ArrayAttr::get(
-                ctx, {rewriter.getBoolAttr(move), rewriter.getIndexAttr(0),
-                      rewriter.getIndexAttr(1)}),
+            /*callee=*/StringAttr::get(ctx, "iree_vm_ref_assign"),
+            /*args=*/ArrayAttr{},
             /*templateArgs=*/ArrayAttr{},
             /*operands=*/
             ArrayRef<Value>{operandRef.getValue(), refPtrOp.getResult()});
@@ -1609,27 +1701,25 @@
 
     // Create a variable for every result and a pointer to it as output
     // parameter to the call.
-    for (OpResult result : op.getResults()) {
-      emitc::ConstantOp resultOp;
-
+    for (OpResult result : op->getResults()) {
       if (result.getType().isa<IREE::VM::RefType>()) {
         Optional<Value> ref = findRef(funcOp, vmAnalysisCache, result);
 
         if (!ref.hasValue()) {
-          return op.emitError() << "local ref not found";
+          return op->emitError() << "local ref not found";
         }
 
         resultOperands.push_back(ref.getValue());
         updatedOperands.push_back(ref.getValue());
       } else {
-        resultOp = rewriter.create<emitc::ConstantOp>(
+        auto resultOp = rewriter.create<emitc::ConstantOp>(
             /*location=*/loc,
             /*resultType=*/result.getType(),
             /*value=*/emitc::OpaqueAttr::get(ctx, ""));
 
         Optional<std::string> cType = getCType(resultOp.getType());
         if (!cType.hasValue()) {
-          return op.emitError() << "unable to emit C type";
+          return op->emitError() << "unable to emit C type";
         }
 
         std::string cPtrType = cType.getValue() + std::string("*");
@@ -1646,38 +1736,20 @@
     return success();
   }
 
-  LogicalResult updateResults(IREE::VM::CallOp op,
+  LogicalResult updateResults(Operation *op,
                               ConversionPatternRewriter &rewriter,
                               SmallVector<Value, 4> &resultOperands) const {
-    auto ctx = op.getContext();
-    auto loc = op.getLoc();
-
-    auto funcOp = op.getOperation()->getParentOfType<mlir::FuncOp>();
+    auto funcOp = op->getParentOfType<mlir::FuncOp>();
     auto ptr = vmAnalysisCache.find(funcOp.getOperation());
     if (ptr == vmAnalysisCache.end()) {
-      return op.emitError() << "parent func op not found in cache.";
+      return op->emitError() << "parent func op not found in cache.";
     }
 
-    for (auto &pair : llvm::enumerate(op.getResults())) {
+    for (auto &pair : llvm::enumerate(op->getResults())) {
       size_t index = pair.index();
       OpResult result = pair.value();
 
-      if (result.getType().isa<IREE::VM::RefType>()) {
-        Optional<Value> ref = findRef(funcOp, vmAnalysisCache, result);
-
-        if (!ref.hasValue()) {
-          return op.emitError() << "local ref not found";
-        }
-
-        rewriter.create<emitc::CallOp>(
-            /*location=*/loc,
-            /*type=*/TypeRange{},
-            /*callee=*/StringAttr::get(ctx, "iree_vm_ref_retain"),
-            /*args=*/ArrayAttr{},
-            /*templateArgs=*/ArrayAttr{},
-            /*operands=*/
-            ArrayRef<Value>{resultOperands[index], ref.getValue()});
-      } else {
+      if (!result.getType().isa<IREE::VM::RefType>()) {
         result.replaceAllUsesWith(resultOperands[index]);
       }
     }
@@ -1714,9 +1786,9 @@
     }
 
     bool moveLhs =
-        ptr->second.isLastValueUse(cmpOp.lhs(), cmpOp.getOperation());
+        ptr->second.isLastValueUse(cmpOp.lhs(), cmpOp.getOperation()) && false;
     bool moveRhs =
-        ptr->second.isLastValueUse(cmpOp.rhs(), cmpOp.getOperation());
+        ptr->second.isLastValueUse(cmpOp.rhs(), cmpOp.getOperation()) && false;
 
     Optional<Value> refLhs = findRef(funcOp, vmAnalysisCache, cmpOp.lhs());
 
@@ -1793,7 +1865,8 @@
     }
 
     bool move =
-        ptr->second.isLastValueUse(cmpOp.operand(), cmpOp.getOperation());
+        ptr->second.isLastValueUse(cmpOp.operand(), cmpOp.getOperation()) &&
+        false;
 
     Optional<Value> ref = findRef(funcOp, vmAnalysisCache, cmpOp.operand());
 
@@ -2005,6 +2078,29 @@
     auto ctx = op.getContext();
     auto loc = op.getLoc();
 
+    assert(op.getOperands().size() == adaptor.getOperands().size());
+
+    auto isNotRefOperand = [](Value operand) {
+      return !operand.getType().isa<IREE::VM::RefType>();
+    };
+
+    SmallVector<Value> nonRefOperands;
+    for (Value operand : op.getOperands()) {
+      if (isNotRefOperand(operand)) {
+        nonRefOperands.push_back(operand);
+      }
+    }
+
+    Block *dest = op.getDest();
+
+    // If we don't have ref block arguments, we can convert the operation
+    // directly.
+    if (adaptor.getOperands().size() == nonRefOperands.size()) {
+      rewriter.replaceOpWithNewOp<mlir::BranchOp>(op, op.dest(),
+                                                  op.getOperands());
+      return success();
+    }
+
     auto funcOp = op.getOperation()->getParentOfType<mlir::FuncOp>();
 
     auto ptr = vmAnalysisCache.find(funcOp.getOperation());
@@ -2012,32 +2108,47 @@
       return op->emitError() << "parent func op not found in cache.";
     }
 
-    for (Value operand : op.getOperands()) {
-      if (!operand.getType().isa<IREE::VM::RefType>()) {
-        continue;
+    Block *destDispatch;
+    {
+      OpBuilder::InsertionGuard guard(rewriter);
+      destDispatch = rewriter.createBlock(dest);
+
+      for (auto pair : llvm::zip(op.getOperands(), dest->getArguments())) {
+        Value operand = std::get<0>(pair);
+        BlockArgument blockArg = std::get<1>(pair);
+
+        if (isNotRefOperand(operand)) {
+          continue;
+        }
+
+        assert(operand.getType().isa<IREE::VM::RefType>());
+        assert(blockArg.getType().isa<IREE::VM::RefType>());
+
+        Optional<Value> operandRef = findRef(funcOp, vmAnalysisCache, operand);
+        Optional<Value> blockArgRef =
+            findRef(funcOp, vmAnalysisCache, blockArg);
+
+        if (!operandRef.hasValue()) {
+          return op.emitError() << "local ref not found";
+        }
+        if (!blockArgRef.hasValue()) {
+          return op.emitError() << "local ref not found";
+        }
+
+        rewriter.create<emitc::CallOp>(
+            /*location=*/loc,
+            /*type=*/TypeRange{},
+            /*callee=*/
+            StringAttr::get(ctx, "iree_vm_ref_retain"),
+            /*args=*/ArrayAttr{},
+            /*templateArgs=*/ArrayAttr{},
+            /*operands=*/
+            ArrayRef<Value>{operandRef.getValue(), blockArgRef.getValue()});
       }
-
-      assert(operand.getType() ==
-             emitc::OpaqueType::get(ctx, "iree_vm_ref_t*"));
-
-      Optional<Value> destRef = findRef(funcOp, vmAnalysisCache, operand);
-
-      if (!destRef.hasValue()) {
-        return op->emitError() << "local ref not found";
-      }
-
-      rewriter.create<emitc::CallOp>(
-          /*location=*/loc,
-          /*type=*/TypeRange{},
-          /*callee=*/
-          StringAttr::get(ctx, "iree_vm_ref_retain"),
-          /*args=*/ArrayAttr{},
-          /*templateArgs=*/ArrayAttr{},
-          /*operands=*/ArrayRef<Value>{operand, destRef.getValue()});
+      rewriter.create<mlir::BranchOp>(loc, op.dest(), op.getOperands());
     }
 
-    rewriter.replaceOpWithNewOp<mlir::BranchOp>(op, op.dest(),
-                                                op.getOperands());
+    rewriter.replaceOpWithNewOp<mlir::BranchOp>(op, destDispatch);
 
     return success();
   }
@@ -2244,21 +2355,33 @@
 
     // The result variables are the last N arguments of the function.
     unsigned int firstOutputArgumentIndex =
-        funcOp.getNumArguments() - adaptor.getOperands().size();
+        funcOp.getNumArguments() - op.getOperands().size();
 
-    for (auto &operand : llvm::enumerate(adaptor.getOperands())) {
-      unsigned int argumentIndex = firstOutputArgumentIndex + operand.index();
+    for (auto &pair : llvm::enumerate(op.getOperands())) {
+      Value operand = pair.value();
+      size_t index = pair.index();
+
+      unsigned int argumentIndex = firstOutputArgumentIndex + index;
       BlockArgument resultArgument = funcOp.getArgument(argumentIndex);
 
-      if (operand.value().getType() ==
-          emitc::OpaqueType::get(ctx, "iree_vm_ref_t*")) {
+      if (operand.getType().isa<IREE::VM::RefType>()) {
+        assert(operand.getType() !=
+               emitc::OpaqueType::get(ctx, "iree_vm_ref_t*"));
+
+        Optional<Value> operandRef = findRef(funcOp, vmAnalysisCache, operand);
+
+        if (!operandRef.hasValue()) {
+          return op->emitError() << "local ref not found";
+        }
+
         rewriter.create<emitc::CallOp>(
             /*location=*/loc,
             /*type=*/TypeRange{},
             /*callee=*/StringAttr::get(ctx, "iree_vm_ref_move"),
             /*args=*/ArrayAttr{},
             /*templateArgs=*/ArrayAttr{},
-            /*operands=*/ArrayRef<Value>{operand.value(), resultArgument});
+            /*operands=*/
+            ArrayRef<Value>{operandRef.getValue(), resultArgument});
       } else {
         rewriter.create<emitc::CallOp>(
             /*location=*/loc,
@@ -2266,11 +2389,11 @@
             /*callee=*/StringAttr::get(ctx, "EMITC_DEREF_ASSIGN_VALUE"),
             /*args=*/ArrayAttr{},
             /*templateArgs=*/ArrayAttr{},
-            /*operands=*/ArrayRef<Value>{resultArgument, operand.value()});
+            /*operands=*/ArrayRef<Value>{resultArgument, operand});
       }
     }
 
-    releaseLocalRefs(rewriter, loc, funcOp, vmAnalysisCache);
+    releaseRefs(rewriter, loc, funcOp, vmAnalysisCache);
 
     auto status = rewriter.create<emitc::CallOp>(
         /*location=*/loc,
@@ -2314,7 +2437,7 @@
 
       auto funcOp = op.getOperation()->getParentOfType<mlir::FuncOp>();
 
-      releaseLocalRefs(rewriter, loc, funcOp, vmAnalysisCache);
+      releaseRefs(rewriter, loc, funcOp, vmAnalysisCache);
 
       auto status = rewriter.create<emitc::CallOp>(
           /*location=*/loc,
@@ -2333,7 +2456,7 @@
 
       auto funcOp = op.getOperation()->getParentOfType<mlir::FuncOp>();
 
-      releaseLocalRefs(rewriter, loc, funcOp, vmAnalysisCache);
+      releaseRefs(rewriter, loc, funcOp, vmAnalysisCache);
 
       std::string message = std::string("\"") +
                             op.message().getValueOr("").str() +
@@ -2563,7 +2686,7 @@
     Value srcRef = isLoad ? stateRef.getResult(0) : localRef.getValue();
     Value destRef = isLoad ? localRef.getValue() : stateRef.getResult(0);
 
-    bool move = ptr->second.isLastValueUse(localValue, op);
+    bool move = ptr->second.isLastValueUse(localValue, op) && false;
 
     returnIfError(
         /*rewriter=*/rewriter,
@@ -2640,7 +2763,7 @@
              rewriter.getIndexAttr(1)}),
         /*templateArgs=*/ArrayAttr{},
         /*operands=*/
-        ArrayRef<Value>{rwDataPtr.getResult(0), adaptor.getOperands()[0]});
+        ArrayRef<Value>{rwDataPtr.getResult(0), storeOp.value()});
 
     return success();
   }
@@ -3239,7 +3362,9 @@
       return setOp.emitError() << "parent func op not found in cache.";
     }
 
-    bool move = ptr->second.isLastValueUse(setOp.value(), setOp.getOperation());
+    bool move =
+        ptr->second.isLastValueUse(setOp.value(), setOp.getOperation()) &&
+        false;
 
     StringRef callee =
         move ? "iree_vm_list_set_ref_move" : "iree_vm_list_set_ref_retain";
@@ -3274,7 +3399,10 @@
 
   // CFG
   patterns.insert<BranchOpConversion>(typeConverter, context, vmAnalysisCache);
-  patterns.insert<CallOpConversion>(typeConverter, context, vmAnalysisCache);
+  patterns.insert<CallOpConversion<IREE::VM::CallOp>>(typeConverter, context,
+                                                      vmAnalysisCache);
+  patterns.insert<CallOpConversion<IREE::VM::CallVariadicOp>>(
+      typeConverter, context, vmAnalysisCache);
   patterns.insert<CondBranchOpConversion>(typeConverter, context,
                                           vmAnalysisCache);
   patterns.insert<FailOpConversion>(typeConverter, context, vmAnalysisCache);
@@ -3638,7 +3766,8 @@
       vmAnalysisCache.insert(std::make_pair(
           op, VMAnalysis{RegisterAllocation(op), ValueLiveness(op)}));
 
-      if (failed(convertFuncOp(funcOp, vmAnalysisCache, blockArgsToRemove))) {
+      if (failed(convertFuncOp(funcOp, typeConverter, vmAnalysisCache,
+                               blockArgsToRemove))) {
         return signalPassFailure();
       }
       funcsToRemove.push_back(funcOp);
diff --git a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/VMAnalysis.h b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/VMAnalysis.h
index d127882..d14a26c 100644
--- a/iree/compiler/Dialect/VM/Conversion/VMToEmitC/VMAnalysis.h
+++ b/iree/compiler/Dialect/VM/Conversion/VMToEmitC/VMAnalysis.h
@@ -53,6 +53,7 @@
   }
 
   DenseMap<int64_t, Operation *> &localRefs() { return refs; }
+  size_t numRefArguments;
 
  private:
   RegisterAllocation registerAllocation;
diff --git a/iree/compiler/Dialect/VM/Target/C/CModuleTarget.cpp b/iree/compiler/Dialect/VM/Target/C/CModuleTarget.cpp
index 0effb7a..080280e 100644
--- a/iree/compiler/Dialect/VM/Target/C/CModuleTarget.cpp
+++ b/iree/compiler/Dialect/VM/Target/C/CModuleTarget.cpp
@@ -70,7 +70,7 @@
     output << "iree_alignas(" << alignment << ") static const uint8_t "
            << bufferName << "[] = {";
     llvm::interleaveComma(byteBuffer, output, [&](char value) {
-      output << static_cast<unsigned int>(value);
+      output << static_cast<unsigned int>(static_cast<unsigned char>(value));
     });
     output << "};\n";
   }
@@ -138,6 +138,8 @@
   auto printStringView = [](StringRef s) -> std::string {
     // We can't use iree_make_string_view because function calls are not allowed
     // for constant expressions in C.
+    // TODO(#7605): Switch to IREE_SVL. We can't use IREE_SVL today because it
+    // uses designated initializers, which cause issues when compiled as C++.
     return ("{\"" + s + "\", " + std::to_string(s.size()) + "}").str();
   };
 
@@ -145,8 +147,6 @@
   SmallVector<IREE::VM::ExportOp, 4> exportOps(
       moduleOp.getOps<IREE::VM::ExportOp>());
   std::string exportName = moduleName + "_exports_";
-  output << "static const size_t " << exportName
-         << "count_ = " << exportOps.size() << ";\n";
   output << "static const iree_vm_native_export_descriptor_t " << exportName
          << "[] = {\n";
   if (exportOps.empty()) {
@@ -183,17 +183,16 @@
   SmallVector<IREE::VM::ImportOp, 4> importOps(
       moduleOp.getOps<IREE::VM::ImportOp>());
   std::string importName = moduleName + "_imports_";
-  output << "static const size_t " << importName
-         << "count_ = " << importOps.size() << ";\n";
   output << "static const iree_vm_native_import_descriptor_t " << importName
          << "[] = {\n";
   if (importOps.empty()) {
     // Empty list placeholder.
     output << "    {0},\n";
   } else {
-    // sort import ops
+    // sort import ops by ordinal
     llvm::sort(importOps, [](auto &lhs, auto &rhs) {
-      return lhs.getName().compare(rhs.getName()) < 0;
+      return lhs.ordinal().getValue().getZExtValue() <
+             rhs.ordinal().getValue().getZExtValue();
     });
     for (auto importOp : importOps) {
       output << "{" << printStringView(importOp.getName()) << "},\n";
@@ -204,8 +203,6 @@
 
   // functions
   std::string functionName = moduleName + "_funcs_";
-  output << "static const size_t " << functionName
-         << "count_ = " << exportOps.size() << ";\n";
   output << "static const iree_vm_native_function_ptr_t " << functionName
          << "[] = {\n";
   if (exportOps.empty()) {
@@ -240,11 +237,11 @@
   output << "static const iree_vm_native_module_descriptor_t " << descriptorName
          << " = {\n"
          << printStringView(moduleName) << ",\n"
-         << importName << "count_,\n"
+         << importOps.size() << ",\n"
          << importName << ",\n"
-         << exportName << "count_,\n"
+         << exportOps.size() << ",\n"
          << exportName << ",\n"
-         << functionName << "count_,\n"
+         << exportOps.size() << ",\n"
          << functionName << ",\n"
          << "0,\n"
          << "NULL,\n"
diff --git a/iree/samples/static_library/CMakeLists.txt b/iree/samples/static_library/CMakeLists.txt
index a063dd5..1b57d24 100644
--- a/iree/samples/static_library/CMakeLists.txt
+++ b/iree/samples/static_library/CMakeLists.txt
@@ -13,6 +13,7 @@
 # Set iree-translate binary.
 set(_TRANSLATE_TOOL_EXECUTABLE $<TARGET_FILE:iree_tools_iree-translate>)
 
+## Example with VM bytecode module.
 # Setup args for iree-translate.
 set(_TRANSLATE_ARGS)
 list(APPEND _TRANSLATE_ARGS "-iree-mlir-to-vm-bytecode-module")
@@ -71,13 +72,13 @@
 NAME
   static_library_demo
 SRCS
+  "create_bytecode_module.c"
   "static_library_demo.c"
 DEPS
   ::simple_mul_c
   iree::runtime
   iree::hal::local::loaders::static_library_loader
   iree::hal::local::sync_driver
-  iree::task::api
   simple_mul
 )
 
@@ -92,3 +93,77 @@
   LABELS
     "hostonly"
 )
+
+# Stop here unless IREE_ENABLE_EMITC is true or IREE_HOST_BINARY_ROOT is defined.
+if(NOT (IREE_ENABLE_EMITC OR DEFINED IREE_HOST_BINARY_ROOT))
+  return()
+endif()
+
+## Example with VM C module.
+# Setup args for iree-translate.
+set(_TRANSLATE_ARGS)
+list(APPEND _TRANSLATE_ARGS "-iree-mlir-to-vm-c-module")
+list(APPEND _TRANSLATE_ARGS "-iree-hal-target-backends=dylib-llvm-aot")
+list(APPEND _TRANSLATE_ARGS "-iree-llvm-link-embedded=false")
+list(APPEND _TRANSLATE_ARGS "-iree-llvm-link-static")
+list(APPEND _TRANSLATE_ARGS "-iree-llvm-static-library-output-path=simple_mul_c_module.o")
+list(APPEND _TRANSLATE_ARGS "${CMAKE_CURRENT_SOURCE_DIR}/simple_mul.mlir")
+list(APPEND _TRANSLATE_ARGS "-o")
+list(APPEND _TRANSLATE_ARGS "simple_mul_emitc.h")
+
+# Custom command for iree-translate to generate static library and C module.
+add_custom_command(
+  OUTPUT
+    ${CMAKE_CURRENT_BINARY_DIR}/simple_mul_c_module.h
+    ${CMAKE_CURRENT_BINARY_DIR}/simple_mul_c_module.o
+    ${CMAKE_CURRENT_BINARY_DIR}/simple_mul_emitc.h
+  COMMAND ${_TRANSLATE_TOOL_EXECUTABLE} ${_TRANSLATE_ARGS}
+  DEPENDS ${_TRANSLATE_TOOL_EXECUTABLE} "simple_mul.mlir"
+)
+
+# TODO(marbre): Cleanup custom targets and libraries.
+add_custom_target(
+  simple_mul_gen
+  DEPENDS
+    "simple_mul_emitc.h"
+)
+
+add_library(simple_mul_c_module
+ STATIC
+ ${CMAKE_CURRENT_BINARY_DIR}/simple_mul_c_module.o
+)
+add_dependencies(simple_mul_c_module simple_mul_gen)
+
+SET_TARGET_PROPERTIES(
+  simple_mul_c_module
+  PROPERTIES
+  LINKER_LANGUAGE C
+)
+
+# TODO(marbre): Cleanup SRCS and DEPS.
+iree_cc_binary(
+NAME
+  static_library_demo_c
+SRCS
+  "create_c_module.c"
+  "static_library_demo.c"
+  "simple_mul_emitc.h"
+DEPS
+  iree::runtime
+  iree::hal::local::loaders::static_library_loader
+  iree::hal::local::sync_driver
+  iree::vm::shims_emitc
+  simple_mul_c_module
+)
+
+iree_lit_test(
+  NAME
+    static_library_demo_c_test
+  TEST_FILE
+    "static_library_demo_c_test.txt"
+  DATA
+    ::static_library_demo_c
+    iree::tools::IreeFileCheck
+  LABELS
+    "hostonly"
+)
diff --git a/iree/samples/static_library/create_bytecode_module.c b/iree/samples/static_library/create_bytecode_module.c
new file mode 100644
index 0000000..70e31f2
--- /dev/null
+++ b/iree/samples/static_library/create_bytecode_module.c
@@ -0,0 +1,22 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include <stdio.h>
+
+#include "iree/samples/static_library/simple_mul_c.h"
+#include "iree/vm/bytecode_module.h"
+
+// Wraps the simple_mul file TOC data in a span and creates a bytecode module.
+iree_status_t create_module(iree_vm_module_t** module) {
+  const struct iree_file_toc_t* module_file_toc =
+      iree_samples_static_library_simple_mul_create();
+  iree_const_byte_span_t module_data =
+      iree_make_const_byte_span(module_file_toc->data, module_file_toc->size);
+
+  return iree_vm_bytecode_module_create(module_data, iree_allocator_null(),
+                                        iree_allocator_system(), module);
+}
+// Prints the line that static_library_demo_test.txt checks for.
+void print_success(void) { printf("static_library_run_bytecode passed\n"); }
diff --git a/iree/samples/static_library/create_c_module.c b/iree/samples/static_library/create_c_module.c
new file mode 100644
index 0000000..c4939f2
--- /dev/null
+++ b/iree/samples/static_library/create_c_module.c
@@ -0,0 +1,15 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include <stdio.h>
+
+#include "iree/samples/static_library/simple_mul_emitc.h"
+
+// Creates the module by forwarding to module_create with the system allocator.
+iree_status_t create_module(iree_vm_module_t** module) {
+  return module_create(iree_allocator_system(), module);
+}
+// Prints the line that static_library_demo_c_test.txt checks for.
+void print_success(void) { printf("static_library_run_c passed\n"); }
diff --git a/iree/samples/static_library/static_library_demo.c b/iree/samples/static_library/static_library_demo.c
index 9b97293..6b9e36e 100644
--- a/iree/samples/static_library/static_library_demo.c
+++ b/iree/samples/static_library/static_library_demo.c
@@ -7,18 +7,18 @@
 // A example of static library loading in IREE. See the README.md for more info.
 // Note: this demo requires artifacts from iree-translate before it will run.
 
-#include <stdio.h>
-
 #include "iree/hal/local/loaders/static_library_loader.h"
 #include "iree/hal/local/sync_device.h"
 #include "iree/modules/hal/module.h"
 #include "iree/runtime/api.h"
-#include "iree/samples/static_library/simple_mul_c.h"
-#include "iree/task/api.h"
-#include "iree/vm/bytecode_module.h"
 
-// Compiled static library module here to avoid IO:
-#include "iree/samples/static_library/simple_mul.h"
+extern const iree_hal_executable_library_header_t**
+simple_mul_dispatch_0_library_query(
+    iree_hal_executable_library_version_t max_version, void* reserved);
+// A function to create the bytecode or C module.
+extern iree_status_t create_module(iree_vm_module_t** module);
+// Prints the success line; each create_*_module.c variant defines its own.
+extern void print_success(void);
 
 // A function to create the HAL device from the different backend targets.
 // The HAL device is returned based on the implementation, and it must be
@@ -87,18 +87,14 @@
   }
 
   // Load bytecode module from the embedded data. Append to the session.
-  const struct iree_file_toc_t* module_file_toc =
-      iree_samples_static_library_simple_mul_create();
-  iree_const_byte_span_t module_data =
-      iree_make_const_byte_span(module_file_toc->data, module_file_toc->size);
-  iree_vm_module_t* bytecode_module = NULL;
+  iree_vm_module_t* module = NULL;
+
   if (iree_status_is_ok(status)) {
-    status = iree_vm_bytecode_module_create(module_data, iree_allocator_null(),
-                                            iree_allocator_system(),
-                                            &bytecode_module);
+    status = create_module(&module);
   }
+
   if (iree_status_is_ok(status)) {
-    status = iree_runtime_session_append_module(session, bytecode_module);
+    status = iree_runtime_session_append_module(session, module);
   }
 
   // Lookup the entry point function call.
@@ -197,7 +193,7 @@
   iree_hal_device_release(device);
   iree_runtime_session_release(session);
   iree_runtime_instance_release(instance);
-  iree_vm_module_release(bytecode_module);
+  iree_vm_module_release(module);
 
   return status;
 }
@@ -209,6 +205,6 @@
     iree_status_free(result);
     return -1;
   }
-  printf("static_library_run passed\n");
+  print_success();
   return 0;
 }
diff --git a/iree/samples/static_library/static_library_demo_c_test.txt b/iree/samples/static_library/static_library_demo_c_test.txt
new file mode 100644
index 0000000..65f47b1
--- /dev/null
+++ b/iree/samples/static_library/static_library_demo_c_test.txt
@@ -0,0 +1,2 @@
+// RUN: (static_library_demo_c) | IreeFileCheck %s
+// CHECK-LABEL: static_library_run_c passed
diff --git a/iree/samples/static_library/static_library_demo_test.txt b/iree/samples/static_library/static_library_demo_test.txt
index a13d0a4..d7f764f 100644
--- a/iree/samples/static_library/static_library_demo_test.txt
+++ b/iree/samples/static_library/static_library_demo_test.txt
@@ -1,2 +1,2 @@
 // RUN: (static_library_demo) | IreeFileCheck %s
-// CHECK-LABEL: static_library_run passed
+// CHECK-LABEL: static_library_run_bytecode passed
diff --git a/iree/vm/shims_emitc.h b/iree/vm/shims_emitc.h
index 5b19a64..985818c 100644
--- a/iree/vm/shims_emitc.h
+++ b/iree/vm/shims_emitc.h
@@ -7,100 +7,309 @@
 #ifndef IREE_VM_SHIMS_EMITC_H_
 #define IREE_VM_SHIMS_EMITC_H_
 
+#include "iree/base/attributes.h"
 #include "iree/vm/module.h"
+#include "iree/vm/shims.h"
 #include "iree/vm/stack.h"
 
-// see Calling convetion in module.h
-// Variadic arguments are not supported
+// See the calling convention documentation in module.h.
 
-// 0v_v
-typedef iree_status_t (*call_0v_v_t)(iree_vm_stack_t* stack, void* module_ptr,
-                                     void* module_state);
+#define CONCAT(a, b) CONCAT_IMPL(a, b)
+#define CONCAT_IMPL(a, b) a##b
+#define TUPLE_UNPACK(...) TUPLE_UNPACK_IMPL __VA_ARGS__
+#define TUPLE_UNPACK_IMPL(...) __VA_ARGS__
 
-static iree_status_t call_0v_v_shim(iree_vm_stack_t* stack,
-                                    const iree_vm_function_call_t* call,
-                                    call_0v_v_t target_fn, void* module,
-                                    void* module_state,
-                                    iree_vm_execution_result_t* out_result) {
-  return target_fn(stack, module, module_state);
-}
+#define NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N  // 11th arg
+#define NARGS(...) NTH_ARG(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)  // 1..10
 
-// 0v_i
-typedef iree_status_t (*call_0v_i_t)(iree_vm_stack_t* stack, void* module_ptr,
-                                     void* module_state, int32_t* res0);
+#define INC(n) INC_##n
+#define INC_0 1
+#define INC_1 2
+#define INC_2 3
+#define INC_3 4
+#define INC_4 5
+#define INC_5 6
+#define INC_6 7
+#define INC_7 8
+#define INC_8 9
+#define INC_9 10
+#define INC_10 11
 
-static iree_status_t call_0v_i_shim(iree_vm_stack_t* stack,
-                                    const iree_vm_function_call_t* call,
-                                    call_0v_i_t target_fn, void* module,
-                                    void* module_state,
-                                    iree_vm_execution_result_t* out_result) {
-  typedef struct {
-    int32_t ret0;
-  } results_t;
+#define JOIN(...) JOIN_IMPL(__VA_ARGS__)
+#define JOIN_IMPL(...) CONCAT(JOIN_, NARGS(__VA_ARGS__))(__VA_ARGS__)
+#define JOIN_0(...)
+#define JOIN_1(a, ...) CONCAT(a, JOIN_0(__VA_ARGS__))
+#define JOIN_2(a, ...) CONCAT(a, JOIN_1(__VA_ARGS__))
+#define JOIN_3(a, ...) CONCAT(a, JOIN_2(__VA_ARGS__))
+#define JOIN_4(a, ...) CONCAT(a, JOIN_3(__VA_ARGS__))
+#define JOIN_5(a, ...) CONCAT(a, JOIN_4(__VA_ARGS__))
+#define JOIN_6(a, ...) CONCAT(a, JOIN_5(__VA_ARGS__))
+#define JOIN_7(a, ...) CONCAT(a, JOIN_6(__VA_ARGS__))
+#define JOIN_8(a, ...) CONCAT(a, JOIN_7(__VA_ARGS__))
+#define JOIN_9(a, ...) CONCAT(a, JOIN_8(__VA_ARGS__))
+#define JOIN_10(a, ...) CONCAT(a, JOIN_9(__VA_ARGS__))
 
-  results_t* results = (results_t*)call->results.data;
+// Calls f(idx, arg) once per argument; idx starts at 0 and counts upward.
+#define FOR_EACH(f, ...) FOR_EACH_IMPL(f, NARGS(__VA_ARGS__), __VA_ARGS__)
+#define FOR_EACH_IMPL(f, n, ...) CONCAT(FOR_EACH_, n)(f, 0, __VA_ARGS__)
+#define FOR_EACH_0(f, ...)
+#define FOR_EACH_1(f, i, a, ...) f(i, a) FOR_EACH_0(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_2(f, i, a, ...) f(i, a) FOR_EACH_1(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_3(f, i, a, ...) f(i, a) FOR_EACH_2(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_4(f, i, a, ...) f(i, a) FOR_EACH_3(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_5(f, i, a, ...) f(i, a) FOR_EACH_4(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_6(f, i, a, ...) f(i, a) FOR_EACH_5(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_7(f, i, a, ...) f(i, a) FOR_EACH_6(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_8(f, i, a, ...) f(i, a) FOR_EACH_7(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_9(f, i, a, ...) f(i, a) FOR_EACH_8(f, INC(i), __VA_ARGS__)
+#define FOR_EACH_10(f, i, a, ...) f(i, a) FOR_EACH_9(f, INC(i), __VA_ARGS__)
 
-  return target_fn(stack, module, module_state, &results->ret0);
-}
+#define TYPE_JOIN(types) JOIN(TUPLE_UNPACK(types))
 
-// 0i_i
-typedef iree_status_t (*call_0i_i_t)(iree_vm_stack_t* stack, void* module_ptr,
-                                     void* module_state, int32_t arg0,
-                                     int32_t* res0);
+#define EMITC_DEFINE_SHIMS(arg_types, ret_types)                  \
+  EMITC_FIXED_TYPEDEF(arg_types, TYPE_JOIN(arg_types), ret_types, \
+                      TYPE_JOIN(ret_types))                       \
+  EMITC_FIXED_SHIM(arg_types, TYPE_JOIN(arg_types), ret_types,    \
+                   TYPE_JOIN(ret_types))                          \
+  EMITC_FIXED_IMPORT(arg_types, TYPE_JOIN(arg_types), ret_types,  \
+                     TYPE_JOIN(ret_types))
 
-static iree_status_t call_0i_i_shim(iree_vm_stack_t* stack,
-                                    const iree_vm_function_call_t* call,
-                                    call_0i_i_t target_fn, void* module,
-                                    void* module_state,
-                                    iree_vm_execution_result_t* out_result) {
-  typedef struct {
-    int32_t arg0;
-  } args_t;
-  typedef struct {
-    int32_t ret0;
-  } results_t;
+#define EMITC_FIXED_TYPEDEF(arg_types, arg_types_string, ret_types, \
+                            ret_types_string)                       \
+  EMITC_FIXED_TYPEDEF_IMPL(arg_types_string, ret_types_string,      \
+                           INPUT_PARAMETERS(arg_types),             \
+                           OUTPUT_PARAMETERS(ret_types))
 
-  const args_t* args = (const args_t*)call->arguments.data;
-  results_t* results = (results_t*)call->results.data;
+#define EMITC_FIXED_SHIM(arg_types, arg_types_string, ret_types, \
+                         ret_types_string)                       \
+  EMITC_FIXED_SHIM_IMPL(arg_types_string, ret_types_string,      \
+                        INPUT_ARGUMENTS(arg_types),              \
+                        OUTPUT_ARGUMENTS(ret_types))
 
-  return target_fn(stack, module, module_state, args->arg0, &results->ret0);
-}
+#define EMITC_FIXED_IMPORT(arg_types, arg_types_string, ret_types,     \
+                           ret_types_string)                           \
+  EMITC_FIXED_IMPORT_IMPL(                                             \
+      arg_types_string, ret_types_string, INPUT_PARAMETERS(arg_types), \
+      OUTPUT_PARAMETERS(ret_types), PACK_ARGUMENTS(arg_types),         \
+      UNPACK_RESULTS(ret_types))
 
-static iree_status_t call_0i_i_import(iree_vm_stack_t* stack,
-                                      const iree_vm_function_t* import,
-                                      int32_t arg0, int32_t* out_ret0) {
-  iree_vm_function_call_t call;
-  call.function = *import;
-  call.arguments = iree_make_byte_span(&arg0, sizeof(arg0));
-  call.results = iree_make_byte_span(out_ret0, sizeof(*out_ret0));
+#define EMITC_VLA_IMPORT(non_var_arg_types, var_arg_types, ret_types)        \
+  EMITC_VLA_IMPORT_INDIRECT(non_var_arg_types, TYPE_JOIN(non_var_arg_types), \
+                            var_arg_types, TYPE_JOIN(var_arg_types),         \
+                            ret_types, TYPE_JOIN(ret_types))
+#define EMITC_VLA_IMPORT_INDIRECT(non_var_arg_types, non_var_arg_types_string, \
+                                  var_arg_types, var_arg_types_string,         \
+                                  ret_types, ret_types_string)                 \
+  EMITC_VLA_IMPORT_IMPL(                                                       \
+      non_var_arg_types_string, var_arg_types_string, ret_types_string,        \
+      ARGUMENTS_SIZE(non_var_arg_types), ARGUMENTS_SIZE(var_arg_types),        \
+      PACK_VARARG_ARGUMENTS(non_var_arg_types),                                \
+      PACK_VARARG_ARGUMENTS(var_arg_types), UNPACK_VARARG_RESULTS(ret_types),  \
+      UNPACK_RESULTS(ret_types))
 
-  iree_vm_execution_result_t result;
-  memset(&result, 0, sizeof(result));
-  return import->module->begin_call(import->module, stack, &call, &result);
-}
+#define EMITC_FIXED_TYPEDEF_IMPL(arg_types, ret_types, input_parameters, \
+                                 output_parameters)                      \
+  typedef iree_status_t (*call_0##arg_types##_##ret_types##_t)(          \
+      iree_vm_stack_t * IREE_RESTRICT stack, void* IREE_RESTRICT module, \
+      void* IREE_RESTRICT module_state input_parameters output_parameters);
 
-// 0ii_i
-typedef iree_status_t (*call_0ii_i_t)(iree_vm_stack_t* stack, void* module_ptr,
-                                      void* module_state, int32_t arg0,
-                                      int32_t arg1, int32_t* res0);
+// TODO(simon-camp): Check the args and rets pointers for NULL before use; this
+// needs a special case for type 'v'.
+#define EMITC_FIXED_SHIM_IMPL(arg_types, ret_types, input_arguments, \
+                              output_arguments)                      \
+  static iree_status_t call_0##arg_types##_##ret_types##_shim(       \
+      iree_vm_stack_t* IREE_RESTRICT stack,                          \
+      const iree_vm_function_call_t* IREE_RESTRICT call,             \
+      call_0##arg_types##_##ret_types##_t target_fn,                 \
+      void* IREE_RESTRICT module, void* IREE_RESTRICT module_state,  \
+      iree_vm_execution_result_t* IREE_RESTRICT out_result) {        \
+    /*const*/ IREE_VM_ABI_TYPE_NAME(arg_types)* args =               \
+        iree_vm_abi_##arg_types##_checked_deref(call->arguments);    \
+    IREE_VM_ABI_TYPE_NAME(ret_types)* rets =                         \
+        iree_vm_abi_##ret_types##_checked_deref(call->results);      \
+                                                                     \
+    iree_vm_abi_##ret_types##_reset(rets);                           \
+    return target_fn(stack, module,                                  \
+                     module_state input_arguments output_arguments); \
+  }
 
-static iree_status_t call_0ii_i_shim(iree_vm_stack_t* stack,
-                                     const iree_vm_function_call_t* call,
-                                     call_0ii_i_t target_fn, void* module,
-                                     void* module_state,
-                                     iree_vm_execution_result_t* out_result) {
-  typedef struct {
-    int32_t arg0;
-    int32_t arg1;
-  } args_t;
-  typedef struct {
-    int32_t ret0;
-  } results_t;
+#define EMITC_FIXED_IMPORT_IMPL(arg_types, ret_types, input_parameters,    \
+                                output_parameters, pack_arguments,         \
+                                unpack_results)                            \
+  static iree_status_t call_0##arg_types##_##ret_types##_import(           \
+      iree_vm_stack_t* IREE_RESTRICT stack,                                \
+      const iree_vm_function_t* IREE_RESTRICT import input_parameters      \
+          output_parameters) {                                             \
+    IREE_VM_ABI_TYPE_NAME(arg_types) arguments;                            \
+    iree_vm_abi_##arg_types##_reset(&arguments);                           \
+    pack_arguments;                                                        \
+                                                                           \
+    IREE_VM_ABI_TYPE_NAME(ret_types) results;                              \
+    iree_vm_abi_##ret_types##_reset(&results);                             \
+                                                                           \
+    iree_vm_function_call_t call;                                          \
+    call.function = *import;                                               \
+    call.arguments = iree_make_byte_span(&arguments, sizeof(arguments));   \
+    call.results = iree_make_byte_span(&results, sizeof(results));         \
+                                                                           \
+    iree_vm_execution_result_t result;                                     \
+    memset(&result, 0, sizeof(result));                                    \
+                                                                           \
+    iree_status_t status =                                                 \
+        import->module->begin_call(import->module, stack, &call, &result); \
+                                                                           \
+    if (!iree_status_is_ok(status)) {                                      \
+      return status;                                                       \
+    }                                                                      \
+                                                                           \
+    unpack_results;                                                        \
+                                                                           \
+    return status;                                                         \
+  }
 
-  const args_t* args = (const args_t*)call->arguments.data;
-  results_t* results = (results_t*)call->results.data;
-  return target_fn(stack, module, module_state, args->arg0, args->arg1,
-                   &results->ret0);
-}
+#define EMITC_VA_LIST_NAME varargs
+#define EMITC_CALL_ARG_PTR_NAME ptr
+#define EMITC_VLA_IMPORT_IMPL(non_var_arg_types, var_arg_types, ret_types,    \
+                              non_var_arg_size, var_arg_size, pack_args,      \
+                              pack_var_args, define_results, unpack_results)  \
+  static iree_status_t                                                        \
+      call_0##non_var_arg_types##C##var_arg_types##D_##ret_types##_import(    \
+          iree_vm_stack_t* IREE_RESTRICT stack,                               \
+          const iree_vm_function_t* IREE_RESTRICT import, int32_t span_count, \
+          ...) {                                                              \
+    iree_host_size_t total_size =                                             \
+        non_var_arg_size + sizeof(int32_t) + span_count * var_arg_size;       \
+                                                                              \
+    IREE_VM_ABI_TYPE_NAME(ret_types) results;                                 \
+    iree_vm_abi_##ret_types##_reset(&results);                                \
+                                                                              \
+    iree_vm_function_call_t call;                                             \
+    call.function = *import;                                                  \
+    call.arguments.data_length = total_size;                                  \
+    call.arguments.data = (uint8_t*)iree_alloca(call.arguments.data_length);  \
+    call.results = iree_make_byte_span(&results, sizeof(results));            \
+                                                                              \
+    memset(call.arguments.data, 0, call.arguments.data_length);               \
+                                                                              \
+    uint8_t* EMITC_CALL_ARG_PTR_NAME = call.arguments.data;                   \
+    va_list EMITC_VA_LIST_NAME;                                               \
+    va_start(EMITC_VA_LIST_NAME, span_count);                                 \
+                                                                              \
+    pack_args;                                                                \
+    memcpy(EMITC_CALL_ARG_PTR_NAME, &span_count, sizeof(int32_t));            \
+    EMITC_CALL_ARG_PTR_NAME += sizeof(int32_t);                               \
+    for (int32_t i = 0; i < span_count; i++) {                                \
+      pack_var_args                                                           \
+    }                                                                         \
+    define_results;                                                           \
+                                                                              \
+    va_end(EMITC_VA_LIST_NAME);                                               \
+                                                                              \
+    iree_vm_execution_result_t result;                                        \
+    memset(&result, 0, sizeof(result));                                       \
+                                                                              \
+    iree_status_t status =                                                    \
+        import->module->begin_call(import->module, stack, &call, &result);    \
+                                                                              \
+    if (!iree_status_is_ok(status)) {                                         \
+      return status;                                                          \
+    }                                                                         \
+                                                                              \
+    unpack_results;                                                           \
+                                                                              \
+    return status;                                                            \
+  }
+// Sums a per-type size over a type tuple: i = int32_t, r = iree_vm_ref_t, v = 0.
+#define ARGUMENTS_SIZE(types) (0 FOR_EACH(ARGUMENT_SIZE, TUPLE_UNPACK(types)))
+#define ARGUMENT_SIZE(idx, arg) +ARGUMENT_SIZE_##arg
+#define ARGUMENT_SIZE_i sizeof(int32_t)
+#define ARGUMENT_SIZE_r sizeof(iree_vm_ref_t)
+#define ARGUMENT_SIZE_v 0
+// Per entry: `, args->i<idx>` for i, `, &args->r<idx>` for r, nothing for v.
+#define INPUT_ARGUMENTS(types) FOR_EACH(INPUT_ARGUMENT, TUPLE_UNPACK(types))
+#define INPUT_ARGUMENT(idx, arg) INPUT_ARGUMENT_##arg(idx)
+#define INPUT_ARGUMENT_i(idx) , args->i##idx
+#define INPUT_ARGUMENT_r(idx) , &args->r##idx
+#define INPUT_ARGUMENT_v(idx)
+// Per entry: `, &rets->i<idx>` for i, `, &rets->r<idx>` for r, nothing for v.
+#define OUTPUT_ARGUMENTS(types) FOR_EACH(OUTPUT_ARGUMENT, TUPLE_UNPACK(types))
+#define OUTPUT_ARGUMENT(idx, arg) OUTPUT_ARGUMENT_##arg(idx)
+#define OUTPUT_ARGUMENT_i(idx) , &rets->i##idx
+#define OUTPUT_ARGUMENT_r(idx) , &rets->r##idx
+#define OUTPUT_ARGUMENT_v(idx)
+// Per entry: `, int32_t arg<idx>` for i, `, iree_vm_ref_t* arg<idx>` for r.
+#define INPUT_PARAMETERS(types) FOR_EACH(INPUT_PARAMETER, TUPLE_UNPACK(types))
+#define INPUT_PARAMETER(idx, arg) INPUT_PARAMETER_##arg(idx)
+#define INPUT_PARAMETER_i(idx) , int32_t arg##idx
+#define INPUT_PARAMETER_r(idx) , iree_vm_ref_t* arg##idx
+#define INPUT_PARAMETER_v(idx)
+// Per entry: `, int32_t* ret<idx>` for i, `, iree_vm_ref_t* ret<idx>` for r.
+#define OUTPUT_PARAMETERS(types) FOR_EACH(OUTPUT_PARAMETER, TUPLE_UNPACK(types))
+#define OUTPUT_PARAMETER(idx, arg) OUTPUT_PARAMETER_##arg(idx)
+#define OUTPUT_PARAMETER_i(idx) , int32_t* ret##idx
+#define OUTPUT_PARAMETER_r(idx) , iree_vm_ref_t* ret##idx
+#define OUTPUT_PARAMETER_v(idx)
+// Fills `arguments` from arg<idx>: i by assignment, r via iree_vm_ref_assign.
+#define PACK_ARGUMENTS(types) FOR_EACH(PACK_ARGUMENT, TUPLE_UNPACK(types))
+#define PACK_ARGUMENT(idx, arg) PACK_ARGUMENT_##arg(idx)
+#define PACK_ARGUMENT_i(idx) arguments.i##idx = arg##idx;
+#define PACK_ARGUMENT_r(idx) iree_vm_ref_assign(arg##idx, &arguments.r##idx);
+#define PACK_ARGUMENT_v(idx)
+// Fills ret<idx> from `results`: i by assignment, r via iree_vm_ref_move.
+#define UNPACK_RESULTS(types) FOR_EACH(UNPACK_RESULT, TUPLE_UNPACK(types))
+#define UNPACK_RESULT(idx, arg) UNPACK_RESULT_##arg(idx)
+#define UNPACK_RESULT_i(idx) *ret##idx = results.i##idx;
+#define UNPACK_RESULT_r(idx) iree_vm_ref_move(&results.r##idx, ret##idx);
+#define UNPACK_RESULT_v(idx)
+// Reads each vararg with va_arg, writes it at the arg pointer, advances it.
+#define PACK_VARARG_ARGUMENTS(types) FOR_EACH(PACK_VARARG, TUPLE_UNPACK(types))
+#define PACK_VARARG(idx, arg) PACK_VARARG_##arg(idx)
+#define PACK_VARARG_i(idx)                                        \
+  PACK_VARARG_i_IMPL(EMITC_CALL_ARG_PTR_NAME, EMITC_VA_LIST_NAME, \
+                     CONCAT(_temp, __COUNTER__))
+#define PACK_VARARG_i_IMPL(dest, varargs, temp) \
+  int32_t temp = va_arg(varargs, int32_t);      \
+  memcpy(dest, &temp, sizeof(int32_t));         \
+  dest += sizeof(int32_t);
+#define PACK_VARARG_r(idx)                                        \
+  PACK_VARARG_r_IMPL(EMITC_CALL_ARG_PTR_NAME, EMITC_VA_LIST_NAME, \
+                     CONCAT(_temp, __COUNTER__))
+#define PACK_VARARG_r_IMPL(dest, varargs, temp)          \
+  iree_vm_ref_t* temp = va_arg(varargs, iree_vm_ref_t*); \
+  iree_vm_ref_assign(temp, (iree_vm_ref_t*)(dest));      \
+  dest += sizeof(iree_vm_ref_t);
+#define PACK_VARARG_v(idx)
+// Declares ret<idx> as a pointer fetched from the va_list, one per entry.
+#define UNPACK_VARARG_RESULTS(types) \
+  FOR_EACH(UNPACK_VARARG, TUPLE_UNPACK(types))
+#define UNPACK_VARARG(idx, arg) UNPACK_VARARG_##arg(idx)
+#define UNPACK_VARARG_i(idx) \
+  int32_t* ret##idx = va_arg(EMITC_VA_LIST_NAME, int32_t*);
+#define UNPACK_VARARG_r(idx) \
+  iree_vm_ref_t* ret##idx = va_arg(EMITC_VA_LIST_NAME, iree_vm_ref_t*);
+#define UNPACK_VARARG_v(idx)
+// Instantiations of the shim and VLA-import macros, one per type signature.
+EMITC_DEFINE_SHIMS((i), (i))
+EMITC_DEFINE_SHIMS((i, i), (i))
+EMITC_DEFINE_SHIMS((r), (r))
+EMITC_DEFINE_SHIMS((r), (v))
+EMITC_DEFINE_SHIMS((r, i, i), (r))
+EMITC_DEFINE_SHIMS((r, i, i, i), (r))
+EMITC_DEFINE_SHIMS((r, i, i, i), (v))
+EMITC_DEFINE_SHIMS((r, r, i, i, i, i), (v))
+EMITC_DEFINE_SHIMS((r, r, r, i, i, i), (v))
+EMITC_DEFINE_SHIMS((r, r), (r))
+EMITC_DEFINE_SHIMS((r, r), (v))
+EMITC_DEFINE_SHIMS((r, r, r), (i, i))
+EMITC_DEFINE_SHIMS((v), (i))
+EMITC_DEFINE_SHIMS((v), (r))
+EMITC_DEFINE_SHIMS((v), (v))
+
+EMITC_VLA_IMPORT((r, i), (i, i, i), (r))
+EMITC_VLA_IMPORT((r, i), (r), (r))
+EMITC_VLA_IMPORT((r, i, i), (i), (r))
+EMITC_VLA_IMPORT((r, r, i), (i, r, i, i), (v))
+EMITC_VLA_IMPORT((r, r, r), (r), (r))
+EMITC_VLA_IMPORT((r, r, i, i), (i), (v))
 
 #endif  // IREE_VM_SHIMS_EMITC_H_
diff --git a/scripts/lint.sh b/scripts/lint.sh
new file mode 100755
index 0000000..669bd77
--- /dev/null
+++ b/scripts/lint.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Runs locally all the lint checks that we run on GitHub. Skips a check if the
+# relevant tool doesn't exist.
+
+# Keep this in sync with .github/workflows/lint.yml
+
+# WARNING: this script *makes changes* to the working directory and the index.
+
+set -uo pipefail  # no -e: failing checks are recorded in FINAL_RET, not fatal
+
+FINAL_RET=0   # highest exit code seen so far; the script exits with it
+LATEST_RET=0  # exit code captured by the most recent update_ret call
+
+function update_ret() {  # DEBUG-trap handler: folds "$?" into FINAL_RET
+  LATEST_RET="$?"  # must be read first, before any command overwrites "$?"
+  if [[ "${LATEST_RET}" -gt "${FINAL_RET}" ]]; then
+    FINAL_RET="${LATEST_RET}"  # keep the largest exit code seen so far
+  fi
+}
+
+# Update the exit code after every command by hooking bash's DEBUG trap
+function enable_update_ret() {
+  trap update_ret DEBUG
+}
+
+function disable_update_ret() {  # stop recording exit codes
+  trap - DEBUG  # "-" resets the DEBUG trap to its default (no handler)
+}
+
+function exists() {  # succeeds iff "${1}" resolves to a command
+  command -v "${1}" > /dev/null  # stdout (the resolved path) is discarded
+}
+
+
+echo "***** Uncommitted changes *****"
+git add -A  # stage everything so untracked files also show up in the diff
+git diff HEAD --exit-code  # exits non-zero when anything differs from HEAD
+
+if [[ $? -ne 0 ]]; then  # "$?" is still the status of the git diff above
+  echo "Found uncommitted changes in working directory. This script requires" \
+        "all changes to be committed. All changes have been added to the" \
+        "index. Please commit or clean all changes and try again."
+  exit 1
+fi
+
+enable_update_ret  # from here on, failing commands raise FINAL_RET
+
+echo "***** Bazel -> CMake *****"
+./build_tools/bazel_to_cmake/bazel_to_cmake.py
+./build_tools/bazel_to_cmake/bazel_to_cmake.py --root_dir=integrations/tensorflow/e2e
+git add -A
+git diff HEAD --exit-code  # non-zero if bazel_to_cmake changed anything vs HEAD
+trap - DEBUG  # same command that disable_update_ret runs
+
+echo "***** buildifier *****"
+# Pause tracking so a false `exists` (tool missing) isn't recorded as failure
+disable_update_ret
+if exists buildifier; then
+  enable_update_ret
+  ./scripts/run_buildifier.sh
+  git diff --exit-code  # non-zero if the working tree differs from the index
+else
+  enable_update_ret  # resume tracking for the checks that follow
+  echo "'buildifier' not found. Skipping check"
+fi
+
+echo "***** yapf *****"
+# Pause tracking so a false `exists` (tool missing) isn't recorded as failure
+disable_update_ret
+if exists yapf; then  # `yapf -i` edits files in place; the diff check flags them
+  enable_update_ret
+  git diff -U0 main | ./third_party/format_diff/format_diff.py yapf -i && git diff --exit-code
+else
+  enable_update_ret  # resume tracking for the checks that follow
+  echo "'yapf' not found. Skipping check"
+fi
+
+echo "***** pytype *****"
+# Pause tracking so a false `exists` (tool missing) isn't recorded as failure
+disable_update_ret
+if exists pytype; then
+  enable_update_ret
+  ./build_tools/pytype/check_diff.sh
+else
+  enable_update_ret  # resume tracking for the checks that follow
+  echo "'pytype' not found. Skipping check"
+fi
+
+echo "***** clang-format *****"
+# Pause tracking so a false `exists` (tool missing) isn't recorded as failure
+disable_update_ret
+if exists git-clang-format; then
+  enable_update_ret
+  git-clang-format --style=file
+  git diff --exit-code  # non-zero if the working tree differs from the index
+else
+  enable_update_ret  # resume tracking for the checks that follow
+  echo "'git-clang-format' not found. Skipping check"
+fi
+
+echo "***** submodules *****"  # exit-code tracking is still enabled here
+./scripts/git/submodule_versions.py check
+
+echo "***** tabs *****"
+./scripts/check_tabs.sh
+
+echo "***** yamllint *****"
+echo "'yamllint' check not yet implemented. Skipping check"
+
+if [[ "${FINAL_RET}" -ne 0 ]]; then
+  echo "Encountered failures. Check error messages and changes to the working" \
+       "directory and git index (which may contain fixes) and try again."
+fi
+
+exit "${FINAL_RET}"  # highest exit code recorded by update_ret (0 if none failed)
diff --git a/third_party/format_diff/format_diff.py b/third_party/format_diff/format_diff.py
old mode 100644
new mode 100755