Rework includes and directory names for samples. (#9177)

Refactoring after https://github.com/google/iree/pull/8958 shuffled these files around, aiming to remove custom logic in samples that we want users to be able to fork easily.

Paths are reverted from `iree_[sample_name]` back to `[sample_name]`

Includes changed to match:

| | |
| -- | -- |
| C++ includes before | `#include "iree_[sample_name]/[header_name].h"` |
| C++ includes now | `#include "samples/[sample_name]/[header_name].h"` |
| Another option | `#include "header_name.h"` 

The third option would require adding
```
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
```
to `target_include_directories()` (I already do that in the experimental web samples)
diff --git a/samples/simple_embedding/BUILD b/samples/simple_embedding/BUILD
new file mode 100644
index 0000000..0bef44c
--- /dev/null
+++ b/samples/simple_embedding/BUILD
@@ -0,0 +1,338 @@
+# Copyright 2019 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//iree:build_defs.oss.bzl", "iree_cmake_extra_content")
+load("//build_tools/bazel:iree_bytecode_module.bzl", "iree_bytecode_module")
+load("//build_tools/bazel:native_binary.bzl", "native_test")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+iree_cmake_extra_content(
+    content = """
+if((IREE_HAL_DRIVER_VMVX OR IREE_HAL_DRIVER_VMVX_SYNC) AND
+   (IREE_TARGET_BACKEND_VMVX OR DEFINED IREE_HOST_BINARY_ROOT))
+""",
+    inline = True,
+)
+
+cc_binary(
+    name = "simple_embedding_vmvx_sync",
+    srcs = [
+        "device_vmvx_sync.c",
+        "simple_embedding.c",
+    ],
+    deps = [
+        ":simple_embedding_test_bytecode_module_vmvx_c",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:sync_driver",
+        "//runtime/src/iree/hal/local/loaders:vmvx_module_loader",
+        "//runtime/src/iree/modules/hal",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:bytecode_module",
+    ],
+)
+
+iree_bytecode_module(
+    name = "simple_embedding_test_bytecode_module_vmvx",
+    src = "simple_embedding_test.mlir",
+    c_identifier = "iree_samples_simple_embedding_test_module_vmvx",
+    flags = [
+        "--iree-input-type=mhlo",
+        "--iree-mlir-to-vm-bytecode-module",
+        "--iree-hal-target-backends=vmvx",
+    ],
+)
+
+native_test(
+    name = "simple_embedding_vmvx_sync_test",
+    src = ":simple_embedding_vmvx_sync",
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
+
+iree_cmake_extra_content(
+    content = """
+if((IREE_HAL_DRIVER_DYLIB OR IREE_HAL_DRIVER_DYLIB_SYNC) AND
+   (IREE_TARGET_BACKEND_DYLIB_LLVM_AOT OR DEFINED IREE_HOST_BINARY_ROOT))
+""",
+    inline = True,
+)
+
+cc_binary(
+    name = "simple_embedding_embedded_sync",
+    srcs = [
+        "device_embedded_sync.c",
+        "simple_embedding.c",
+    ],
+    deps = [
+        ":simple_embedding_test_bytecode_module_dylib_arm_32_c",
+        ":simple_embedding_test_bytecode_module_dylib_arm_64_c",
+        ":simple_embedding_test_bytecode_module_dylib_riscv_32_c",
+        ":simple_embedding_test_bytecode_module_dylib_riscv_64_c",
+        ":simple_embedding_test_bytecode_module_dylib_x86_64_c",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:sync_driver",
+        "//runtime/src/iree/hal/local/loaders:embedded_library_loader",
+        "//runtime/src/iree/modules/hal",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:bytecode_module",
+    ],
+)
+
+iree_bytecode_module(
+    name = "simple_embedding_test_bytecode_module_dylib_x86_64",
+    src = "simple_embedding_test.mlir",
+    c_identifier = "iree_samples_simple_embedding_test_module_dylib_x86_64",
+    flags = [
+        "--iree-input-type=mhlo",
+        "--iree-mlir-to-vm-bytecode-module",
+        "--iree-hal-target-backends=dylib-llvm-aot",
+        "--iree-llvm-target-triple=x86_64-pc-linux-elf",
+        "--iree-llvm-debug-symbols=false",
+        "--iree-vm-bytecode-module-strip-source-map=true",
+        "--iree-vm-emit-polyglot-zip=false",
+    ],
+)
+
+iree_bytecode_module(
+    name = "simple_embedding_test_bytecode_module_dylib_riscv_32",
+    src = "simple_embedding_test.mlir",
+    c_identifier = "iree_samples_simple_embedding_test_module_dylib_riscv_32",
+    flags = [
+        "--iree-input-type=mhlo",
+        "--iree-mlir-to-vm-bytecode-module",
+        "--iree-hal-target-backends=dylib-llvm-aot",
+        "--iree-llvm-target-triple=riscv32-pc-linux-elf",
+        "--iree-llvm-target-cpu=generic-rv32",
+        "--iree-llvm-target-cpu-features=+m,+f",
+        "--iree-llvm-target-abi=ilp32",
+        "--iree-llvm-debug-symbols=false",
+        "--iree-vm-bytecode-module-strip-source-map=true",
+        "--iree-vm-emit-polyglot-zip=false",
+    ],
+)
+
+iree_bytecode_module(
+    name = "simple_embedding_test_bytecode_module_dylib_riscv_64",
+    src = "simple_embedding_test.mlir",
+    c_identifier = "iree_samples_simple_embedding_test_module_dylib_riscv_64",
+    flags = [
+        "--iree-input-type=mhlo",
+        "--iree-mlir-to-vm-bytecode-module",
+        "--iree-hal-target-backends=dylib-llvm-aot",
+        "--iree-llvm-target-triple=riscv64-pc-linux-elf",
+        "--iree-llvm-target-cpu=generic-rv64",
+        "--iree-llvm-target-cpu-features=+m,+a,+f,+d,+c",
+        "--iree-llvm-target-abi=lp64d",
+        "--iree-llvm-debug-symbols=false",
+        "--iree-vm-bytecode-module-strip-source-map=true",
+        "--iree-vm-emit-polyglot-zip=false",
+    ],
+)
+
+iree_bytecode_module(
+    name = "simple_embedding_test_bytecode_module_dylib_arm_32",
+    src = "simple_embedding_test.mlir",
+    c_identifier = "iree_samples_simple_embedding_test_module_dylib_arm_32",
+    flags = [
+        "--iree-input-type=mhlo",
+        "--iree-mlir-to-vm-bytecode-module",
+        "--iree-hal-target-backends=dylib-llvm-aot",
+        "--iree-llvm-target-triple=armv7a-pc-linux-elf",
+        "--iree-llvm-target-float-abi=hard",
+        "--iree-llvm-debug-symbols=false",
+        "--iree-vm-bytecode-module-strip-source-map=true",
+        "--iree-vm-emit-polyglot-zip=false",
+    ],
+)
+
+iree_bytecode_module(
+    name = "simple_embedding_test_bytecode_module_dylib_arm_64",
+    src = "simple_embedding_test.mlir",
+    c_identifier = "iree_samples_simple_embedding_test_module_dylib_arm_64",
+    flags = [
+        "--iree-input-type=mhlo",
+        "--iree-mlir-to-vm-bytecode-module",
+        "--iree-hal-target-backends=dylib-llvm-aot",
+        "--iree-llvm-target-triple=aarch64-pc-linux-elf",
+        "--iree-llvm-debug-symbols=false",
+        "--iree-vm-bytecode-module-strip-source-map=true",
+        "--iree-vm-emit-polyglot-zip=false",
+    ],
+)
+
+native_test(
+    name = "simple_embedding_embedded_sync_test",
+    src = ":simple_embedding_embedded_sync",
+)
+
+iree_cmake_extra_content(
+    content = """
+if(IREE_HAL_DRIVER_DYLIB)
+""",
+    inline = True,
+)
+
+cc_binary(
+    name = "simple_embedding_dylib",
+    srcs = [
+        "device_dylib.c",
+        "simple_embedding.c",
+    ],
+    deps = [
+        ":simple_embedding_test_bytecode_module_dylib_arm_64_c",
+        ":simple_embedding_test_bytecode_module_dylib_riscv_64_c",
+        ":simple_embedding_test_bytecode_module_dylib_x86_64_c",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/local",
+        "//runtime/src/iree/hal/local:task_driver",
+        "//runtime/src/iree/hal/local/loaders:embedded_library_loader",
+        "//runtime/src/iree/modules/hal",
+        "//runtime/src/iree/task:api",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:bytecode_module",
+    ],
+)
+
+native_test(
+    name = "simple_embedding_dylib_test",
+    src = ":simple_embedding_dylib",
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+
+if(IREE_HAL_DRIVER_VULKAN AND
+   (IREE_TARGET_BACKEND_VULKAN_SPIRV OR DEFINED IREE_HOST_BINARY_ROOT))
+""",
+    inline = True,
+)
+
+cc_binary(
+    name = "simple_embedding_vulkan",
+    srcs = [
+        "device_vulkan.c",
+        "simple_embedding.c",
+    ],
+    deps = [
+        ":simple_embedding_test_bytecode_module_vulkan_c",
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/hal/vulkan/registration",
+        "//runtime/src/iree/modules/hal",
+        "//runtime/src/iree/vm",
+        "//runtime/src/iree/vm:bytecode_module",
+    ],
+)
+
+iree_bytecode_module(
+    name = "simple_embedding_test_bytecode_module_vulkan",
+    src = "simple_embedding_test.mlir",
+    c_identifier = "iree_samples_simple_embedding_test_module_vulkan",
+    flags = [
+        "--iree-input-type=mhlo",
+        "--iree-mlir-to-vm-bytecode-module",
+        "--iree-hal-target-backends=vulkan-spirv",
+        "--iree-llvm-debug-symbols=false",
+    ],
+)
+
+native_test(
+    name = "simple_embedding_vulkan_test",
+    src = ":simple_embedding_vulkan",
+)
+
+iree_cmake_extra_content(
+    content = """
+endif()
+""",
+    inline = True,
+)
+
+# Disabled because CUDA is not universally available and Bazel does not
+# support configurability in a reasonable or useful way.
+# iree_cmake_extra_content(
+#     content = """
+# if(IREE_HAL_DRIVER_CUDA AND
+#    (IREE_TARGET_BACKEND_CUDA OR DEFINED IREE_HOST_BINARY_ROOT))
+# """,
+#     inline = True,
+# )
+
+# cc_binary(
+#     name = "simple_embedding_cuda",
+#     srcs = [
+#         "device_cuda.c",
+#         "simple_embedding.c",
+#     ],
+#     deps = [
+#         ":simple_embedding_test_bytecode_module_cuda_c",
+#         "//runtime/src/iree/base",
+#         "//runtime/src/iree/hal",
+#         "//runtime/src/iree/hal/cuda/registration",
+#         "//runtime/src/iree/modules/hal",
+#         "//runtime/src/iree/vm",
+#         "//runtime/src/iree/vm:bytecode_module",
+#     ],
+# )
+
+# iree_bytecode_module(
+#     name = "simple_embedding_test_bytecode_module_cuda",
+#     src = "simple_embedding_test.mlir",
+#     c_identifier = "iree_samples_simple_embedding_test_module_cuda",
+#     flags = [
+#         "--iree-input-type=mhlo",
+#         "--iree-mlir-to-vm-bytecode-module",
+#         "--iree-hal-target-backends=cuda",
+#         "--iree-llvm-debug-symbols=false",
+#     ],
+# )
+
+# # native_test(
+# #     name = "simple_embedding_cuda_test",
+# #     src = ":simple_embedding_cuda",
+# #     tags = [
+# #         "driver=cuda",
+# #     ],
+# # )
+
+# iree_cmake_extra_content(
+#     content = """
+# iree_native_test(
+#   NAME
+#     "simple_embedding_cuda_test"
+#   LABELS
+#     "driver=cuda"
+#   SRC
+#     ::simple_embedding_cuda
+# )
+
+# endif()
+# """,
+#     inline = True,
+# )
diff --git a/samples/simple_embedding/CMakeLists.txt b/samples/simple_embedding/CMakeLists.txt
new file mode 100644
index 0000000..c992e06
--- /dev/null
+++ b/samples/simple_embedding/CMakeLists.txt
@@ -0,0 +1,263 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# samples/simple_embedding/BUILD                                               #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+if((IREE_HAL_DRIVER_VMVX OR IREE_HAL_DRIVER_VMVX_SYNC) AND
+   (IREE_TARGET_BACKEND_VMVX OR DEFINED IREE_HOST_BINARY_ROOT))
+
+iree_cc_binary(
+  NAME
+    simple_embedding_vmvx_sync
+  SRCS
+    "device_vmvx_sync.c"
+    "simple_embedding.c"
+  DEPS
+    ::simple_embedding_test_bytecode_module_vmvx_c
+    iree::base
+    iree::hal
+    iree::hal::local
+    iree::hal::local::loaders::vmvx_module_loader
+    iree::hal::local::sync_driver
+    iree::modules::hal
+    iree::vm
+    iree::vm::bytecode_module
+)
+
+iree_bytecode_module(
+  NAME
+    simple_embedding_test_bytecode_module_vmvx
+  SRC
+    "simple_embedding_test.mlir"
+  C_IDENTIFIER
+    "iree_samples_simple_embedding_test_module_vmvx"
+  FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-mlir-to-vm-bytecode-module"
+    "--iree-hal-target-backends=vmvx"
+  PUBLIC
+)
+
+iree_native_test(
+  NAME
+    "simple_embedding_vmvx_sync_test"
+  SRC
+    ::simple_embedding_vmvx_sync
+)
+
+endif()
+
+if((IREE_HAL_DRIVER_DYLIB OR IREE_HAL_DRIVER_DYLIB_SYNC) AND
+   (IREE_TARGET_BACKEND_DYLIB_LLVM_AOT OR DEFINED IREE_HOST_BINARY_ROOT))
+
+iree_cc_binary(
+  NAME
+    simple_embedding_embedded_sync
+  SRCS
+    "device_embedded_sync.c"
+    "simple_embedding.c"
+  DEPS
+    ::simple_embedding_test_bytecode_module_dylib_arm_32_c
+    ::simple_embedding_test_bytecode_module_dylib_arm_64_c
+    ::simple_embedding_test_bytecode_module_dylib_riscv_32_c
+    ::simple_embedding_test_bytecode_module_dylib_riscv_64_c
+    ::simple_embedding_test_bytecode_module_dylib_x86_64_c
+    iree::base
+    iree::hal
+    iree::hal::local
+    iree::hal::local::loaders::embedded_library_loader
+    iree::hal::local::sync_driver
+    iree::modules::hal
+    iree::vm
+    iree::vm::bytecode_module
+)
+
+iree_bytecode_module(
+  NAME
+    simple_embedding_test_bytecode_module_dylib_x86_64
+  SRC
+    "simple_embedding_test.mlir"
+  C_IDENTIFIER
+    "iree_samples_simple_embedding_test_module_dylib_x86_64"
+  FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-mlir-to-vm-bytecode-module"
+    "--iree-hal-target-backends=dylib-llvm-aot"
+    "--iree-llvm-target-triple=x86_64-pc-linux-elf"
+    "--iree-llvm-debug-symbols=false"
+    "--iree-vm-bytecode-module-strip-source-map=true"
+    "--iree-vm-emit-polyglot-zip=false"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    simple_embedding_test_bytecode_module_dylib_riscv_32
+  SRC
+    "simple_embedding_test.mlir"
+  C_IDENTIFIER
+    "iree_samples_simple_embedding_test_module_dylib_riscv_32"
+  FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-mlir-to-vm-bytecode-module"
+    "--iree-hal-target-backends=dylib-llvm-aot"
+    "--iree-llvm-target-triple=riscv32-pc-linux-elf"
+    "--iree-llvm-target-cpu=generic-rv32"
+    "--iree-llvm-target-cpu-features=+m,+f"
+    "--iree-llvm-target-abi=ilp32"
+    "--iree-llvm-debug-symbols=false"
+    "--iree-vm-bytecode-module-strip-source-map=true"
+    "--iree-vm-emit-polyglot-zip=false"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    simple_embedding_test_bytecode_module_dylib_riscv_64
+  SRC
+    "simple_embedding_test.mlir"
+  C_IDENTIFIER
+    "iree_samples_simple_embedding_test_module_dylib_riscv_64"
+  FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-mlir-to-vm-bytecode-module"
+    "--iree-hal-target-backends=dylib-llvm-aot"
+    "--iree-llvm-target-triple=riscv64-pc-linux-elf"
+    "--iree-llvm-target-cpu=generic-rv64"
+    "--iree-llvm-target-cpu-features=+m,+a,+f,+d,+c"
+    "--iree-llvm-target-abi=lp64d"
+    "--iree-llvm-debug-symbols=false"
+    "--iree-vm-bytecode-module-strip-source-map=true"
+    "--iree-vm-emit-polyglot-zip=false"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    simple_embedding_test_bytecode_module_dylib_arm_32
+  SRC
+    "simple_embedding_test.mlir"
+  C_IDENTIFIER
+    "iree_samples_simple_embedding_test_module_dylib_arm_32"
+  FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-mlir-to-vm-bytecode-module"
+    "--iree-hal-target-backends=dylib-llvm-aot"
+    "--iree-llvm-target-triple=armv7a-pc-linux-elf"
+    "--iree-llvm-target-float-abi=hard"
+    "--iree-llvm-debug-symbols=false"
+    "--iree-vm-bytecode-module-strip-source-map=true"
+    "--iree-vm-emit-polyglot-zip=false"
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    simple_embedding_test_bytecode_module_dylib_arm_64
+  SRC
+    "simple_embedding_test.mlir"
+  C_IDENTIFIER
+    "iree_samples_simple_embedding_test_module_dylib_arm_64"
+  FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-mlir-to-vm-bytecode-module"
+    "--iree-hal-target-backends=dylib-llvm-aot"
+    "--iree-llvm-target-triple=aarch64-pc-linux-elf"
+    "--iree-llvm-debug-symbols=false"
+    "--iree-vm-bytecode-module-strip-source-map=true"
+    "--iree-vm-emit-polyglot-zip=false"
+  PUBLIC
+)
+
+iree_native_test(
+  NAME
+    "simple_embedding_embedded_sync_test"
+  SRC
+    ::simple_embedding_embedded_sync
+)
+
+if(IREE_HAL_DRIVER_DYLIB)
+
+iree_cc_binary(
+  NAME
+    simple_embedding_dylib
+  SRCS
+    "device_dylib.c"
+    "simple_embedding.c"
+  DEPS
+    ::simple_embedding_test_bytecode_module_dylib_arm_64_c
+    ::simple_embedding_test_bytecode_module_dylib_riscv_64_c
+    ::simple_embedding_test_bytecode_module_dylib_x86_64_c
+    iree::base
+    iree::hal
+    iree::hal::local
+    iree::hal::local::loaders::embedded_library_loader
+    iree::hal::local::task_driver
+    iree::modules::hal
+    iree::task::api
+    iree::vm
+    iree::vm::bytecode_module
+)
+
+iree_native_test(
+  NAME
+    "simple_embedding_dylib_test"
+  SRC
+    ::simple_embedding_dylib
+)
+
+endif()
+
+endif()
+
+if(IREE_HAL_DRIVER_VULKAN AND
+   (IREE_TARGET_BACKEND_VULKAN_SPIRV OR DEFINED IREE_HOST_BINARY_ROOT))
+
+iree_cc_binary(
+  NAME
+    simple_embedding_vulkan
+  SRCS
+    "device_vulkan.c"
+    "simple_embedding.c"
+  DEPS
+    ::simple_embedding_test_bytecode_module_vulkan_c
+    iree::base
+    iree::hal
+    iree::hal::vulkan::registration
+    iree::modules::hal
+    iree::vm
+    iree::vm::bytecode_module
+)
+
+iree_bytecode_module(
+  NAME
+    simple_embedding_test_bytecode_module_vulkan
+  SRC
+    "simple_embedding_test.mlir"
+  C_IDENTIFIER
+    "iree_samples_simple_embedding_test_module_vulkan"
+  FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-mlir-to-vm-bytecode-module"
+    "--iree-hal-target-backends=vulkan-spirv"
+    "--iree-llvm-debug-symbols=false"
+  PUBLIC
+)
+
+iree_native_test(
+  NAME
+    "simple_embedding_vulkan_test"
+  SRC
+    ::simple_embedding_vulkan
+)
+
+endif()
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/samples/simple_embedding/README.md b/samples/simple_embedding/README.md
new file mode 100644
index 0000000..bcfd7b3
--- /dev/null
+++ b/samples/simple_embedding/README.md
@@ -0,0 +1,108 @@
+# "Simple Embedding" sample
+
+This sample shows how to run a simple pointwise array multiplication bytecode
+module on various HAL device targets with the minimum runtime overhead. Some of
+these devices are compatible with bare-metal systems that lack threading or
+file IO support.
+
+# Background
+
+The main bytecode testing tool
+[iree-run-module](https://github.com/google/iree/tree/main/iree/tools/iree-run-module-main.cc)
+requires proper operating system support to set up the runtime environment to
+execute an IREE bytecode module. On embedded systems, facilities such as a file
+system or multi-threaded asynchronous control may not be available. This sample
+demonstrates how to set up the simplest framework to load and run the IREE
+bytecode with various target backends.
+
+# Build instructions
+
+## CMake (native and cross compilation)
+
+Set up the CMake configuration with `-DIREE_BUILD_SAMPLES=ON` (default on)
+
+Then run
+```sh
+cmake --build <build dir> --target samples/simple_embedding/all
+```
+
+## Bazel (host only)
+```sh
+bazel build samples/simple_embedding:all
+```
+
+The resulting executables are listed as `simple_embedding_<HAL devices>`.
+
+# Code structure
+
+The sample consists of three parts:
+
+## simple_embedding_test.mlir
+The simple pointwise array multiplication op with the entry function called
+`simple_mul`, two `<4xf32>` inputs, and one `<4xf32>` output. The ML bytecode
+modules are automatically generated at build time for the targeted HAL device
+configurations by the host compiler `iree-translate`.
+
+## simple_embedding.c
+
+The main function of the sample has the following steps:
+1. Create a VM instance.
+2. Create a HAL module based on the target device (see the next section).
+3. Load the bytecode module of the ML workload.
+4. Associate the HAL module with the bytecode module in the VM context.
+5. Prepare the function entry point and inputs.
+6. Invoke function.
+7. Retrieve function output.
+
+## device_*.c
+
+The HAL device for different target backends. The device is a `module_loader` +
+`executor` combination. For example,
+[device_embedded_sync.c](https://github.com/google/iree/blob/main/samples/simple_embedding/device_embedded_sync.c)
+uses the embedded library loader and the synchronous executor:
+```c
+iree_hal_sync_device_params_t params;
+iree_hal_sync_device_params_initialize(&params);
+iree_hal_executable_loader_t* loader = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_embedded_library_loader_create(
+      iree_hal_executable_import_provider_null(), iree_allocator_system(),
+      &loader));
+
+iree_string_view_t identifier = iree_make_cstring_view("dylib");
+
+iree_status_t status =
+    iree_hal_sync_device_create(identifier, &params, /*loader_count=*/1,
+                                &loader, iree_allocator_system(), device);
+```
+
+Whereas for
+[device_dylib.c](https://github.com/google/iree/blob/main/samples/simple_embedding/device_dylib.c),
+the executor is replaced with the multi-thread ready asynchronous task executor:
+```c
+...
+iree_task_executor_t* executor = NULL;
+iree_status_t status =
+    iree_task_executor_create_from_flags(iree_allocator_system(), &executor);
+
+iree_string_view_t identifier = iree_make_cstring_view("dylib");
+if (iree_status_is_ok(status)) {
+  // Create the device.
+  status = iree_hal_task_device_create(identifier, &params, executor,
+                                        /*loader_count=*/1, &loader,
+                                        iree_allocator_system(), device);
+```
+An example that utilizes a higher-level driver registry is in
+[device_vulkan.c](https://github.com/google/iree/blob/main/samples/simple_embedding/device_vulkan.c)
+
+### Load device-specific bytecode module
+To avoid the file IO, the bytecode module is converted into a data stream
+(`module_data`) that's embedded in the executable. The same strategy can be
+applied to build applications for the embedded systems without a proper file IO.
+
+# Generic platform support
+Some of the devices in this sample support a generic platform (or the
+machine mode without an operating system). For example, `device_vmvx_sync`
+should support any architecture that IREE supports, and `device_embedded_sync`
+should support any architecture that supports `dylib-llvm-aot` codegen target
+backend (may need to add the bytecode module data if it is not already in
+[device_embedded_sync.c](https://github.com/google/iree/blob/main/samples/simple_embedding/device_embedded_sync.c)).
diff --git a/samples/simple_embedding/device_cuda.c b/samples/simple_embedding/device_cuda.c
new file mode 100644
index 0000000..62c233a
--- /dev/null
+++ b/samples/simple_embedding/device_cuda.c
@@ -0,0 +1,45 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// An example of setting up the CUDA driver.
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/cuda/registration/driver_module.h"
+
+// Compiled module embedded here to avoid file IO:
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_cuda_c.h"
+
+iree_status_t create_sample_device(iree_allocator_t host_allocator,
+                                   iree_hal_device_t** out_device) {
+  // Creates the sample's HAL device in |out_device| via the CUDA driver only.
+  IREE_RETURN_IF_ERROR(
+      iree_hal_cuda_driver_module_register(iree_hal_driver_registry_default()));
+
+  // Create the HAL driver from the name.
+  iree_hal_driver_t* driver = NULL;
+  iree_string_view_t identifier = iree_make_cstring_view("cuda");
+  iree_status_t status = iree_hal_driver_registry_try_create_by_name(
+      iree_hal_driver_registry_default(), identifier, host_allocator, &driver);
+
+  // Create the default device (primary GPU).
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_driver_create_default_device(driver, host_allocator,
+                                                   out_device);
+  }
+  // Release the driver and propagate any failure rather than masking it.
+  iree_hal_driver_release(driver);
+  return status;
+}
+
+const iree_const_byte_span_t load_bytecode_module_data() {
+  // Returns the CUDA module embedded in this binary as a constant byte span.
+  const struct iree_file_toc_t* toc =
+      iree_samples_simple_embedding_test_module_cuda_create();
+  return iree_make_const_byte_span(toc->data, toc->size);
+}
diff --git a/samples/simple_embedding/device_dylib.c b/samples/simple_embedding/device_dylib.c
new file mode 100644
index 0000000..2321437
--- /dev/null
+++ b/samples/simple_embedding/device_dylib.c
@@ -0,0 +1,74 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// An example of setting up the dylib driver.
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/loaders/embedded_library_loader.h"
+#include "iree/hal/local/task_device.h"
+#include "iree/task/api.h"
+
+// Compiled module embedded here to avoid file IO:
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_arm_64_c.h"
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_riscv_64_c.h"
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_x86_64_c.h"
+
+iree_status_t create_sample_device(iree_allocator_t host_allocator,
+                                   iree_hal_device_t** out_device) {
+  // Default parameters for the task device that is created at the end.
+  iree_hal_task_device_params_t device_params;
+  iree_hal_task_device_params_initialize(&device_params);
+
+  // Nothing has been acquired yet, so a loader failure can return directly.
+  iree_hal_executable_loader_t* library_loader = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_embedded_library_loader_create(
+      iree_hal_executable_import_provider_null(), host_allocator,
+      &library_loader));
+
+  // From here on, failures fall through to the shared releases below.
+  iree_task_executor_t* task_executor = NULL;
+  iree_status_t status =
+      iree_task_executor_create_from_flags(host_allocator, &task_executor);
+
+  // Buffer allocations go through a heap allocator built on |host_allocator|.
+  iree_string_view_t device_id = iree_make_cstring_view("dylib");
+  iree_hal_allocator_t* heap_allocator = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_create_heap(device_id, host_allocator,
+                                            host_allocator, &heap_allocator);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_task_device_create(
+        device_id, &device_params, task_executor, /*loader_count=*/1,
+        &library_loader, heap_allocator, host_allocator, out_device);
+  }
+
+  iree_hal_allocator_release(heap_allocator);
+  iree_task_executor_release(task_executor);
+  iree_hal_executable_loader_release(library_loader);
+  return status;
+}
+
+const iree_const_byte_span_t load_bytecode_module_data() {
+  // Select the embedded module that matches the architecture being compiled
+  // for; architectures without a prebuilt module fail the build.
+  const struct iree_file_toc_t* toc =
+#if IREE_ARCH_X86_64
+      iree_samples_simple_embedding_test_module_dylib_x86_64_create();
+#elif IREE_ARCH_RISCV_64
+      iree_samples_simple_embedding_test_module_dylib_riscv_64_create();
+#elif IREE_ARCH_ARM_64
+      iree_samples_simple_embedding_test_module_dylib_arm_64_create();
+#else
+#error "Unsupported platform."
+#endif
+
+  return iree_make_const_byte_span(toc->data, toc->size);
+}
diff --git a/samples/simple_embedding/device_embedded_sync.c b/samples/simple_embedding/device_embedded_sync.c
new file mode 100644
index 0000000..b4bc118
--- /dev/null
+++ b/samples/simple_embedding/device_embedded_sync.c
@@ -0,0 +1,79 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// An example of setting up the embedded-sync driver.
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/loaders/embedded_library_loader.h"
+#include "iree/hal/local/sync_device.h"
+
+// Compiled module embedded here to avoid file IO:
+#if IREE_ARCH_ARM_32
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_arm_32_c.h"
+#elif IREE_ARCH_ARM_64
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_arm_64_c.h"
+#elif IREE_ARCH_RISCV_32
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_riscv_32_c.h"
+#elif IREE_ARCH_RISCV_64
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_riscv_64_c.h"
+#elif IREE_ARCH_X86_64
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_dylib_x86_64_c.h"
+#endif
+
+iree_status_t create_sample_device(iree_allocator_t host_allocator,
+                                   iree_hal_device_t** out_device) {
+  // Default parameters for the synchronous device created below.
+  iree_hal_sync_device_params_t device_params;
+  iree_hal_sync_device_params_initialize(&device_params);
+
+  iree_hal_executable_loader_t* library_loader = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_embedded_library_loader_create(
+      iree_hal_executable_import_provider_null(), host_allocator,
+      &library_loader));
+
+  // Buffer allocations go through a heap allocator built on |host_allocator|.
+  iree_string_view_t device_id = iree_make_cstring_view("dylib");
+  iree_hal_allocator_t* heap_allocator = NULL;
+  iree_status_t status = iree_hal_allocator_create_heap(
+      device_id, host_allocator, host_allocator, &heap_allocator);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_sync_device_create(
+        device_id, &device_params, /*loader_count=*/1, &library_loader,
+        heap_allocator, host_allocator, out_device);
+  }
+
+  // Release the local allocator and loader handles whether or not it worked.
+  iree_hal_allocator_release(heap_allocator);
+  iree_hal_executable_loader_release(library_loader);
+  return status;
+}
+
+const iree_const_byte_span_t load_bytecode_module_data() {
+  // Select the embedded module that matches the architecture being compiled
+  // for; architectures without a prebuilt module fail the build.
+  const struct iree_file_toc_t* toc =
+#if IREE_ARCH_X86_64
+      iree_samples_simple_embedding_test_module_dylib_x86_64_create();
+#elif IREE_ARCH_RISCV_32
+      iree_samples_simple_embedding_test_module_dylib_riscv_32_create();
+#elif IREE_ARCH_RISCV_64
+      iree_samples_simple_embedding_test_module_dylib_riscv_64_create();
+#elif IREE_ARCH_ARM_32
+      iree_samples_simple_embedding_test_module_dylib_arm_32_create();
+#elif IREE_ARCH_ARM_64
+      iree_samples_simple_embedding_test_module_dylib_arm_64_create();
+#else
+#error "Unsupported platform."
+#endif
+
+  // Hand the embedded data back as a (pointer, size) span.
+  return iree_make_const_byte_span(toc->data, toc->size);
+}
diff --git a/samples/simple_embedding/device_vmvx_sync.c b/samples/simple_embedding/device_vmvx_sync.c
new file mode 100644
index 0000000..0683405
--- /dev/null
+++ b/samples/simple_embedding/device_vmvx_sync.c
@@ -0,0 +1,59 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// An example of setting up the vmvx-sync driver.
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/local/executable_loader.h"
+#include "iree/hal/local/loaders/vmvx_module_loader.h"
+#include "iree/hal/local/sync_device.h"
+
+// Compiled module embedded here to avoid file IO:
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_vmvx_c.h"
+
+// Builds the sample's HAL device: a synchronous device that loads executables
+// through the VMVX module loader and allocates buffers from the host heap.
+iree_status_t create_sample_device(iree_allocator_t host_allocator,
+                                   iree_hal_device_t** out_device) {
+  const iree_string_view_t device_id = iree_make_cstring_view("vmvx");
+  iree_hal_sync_device_params_t device_params;
+  iree_hal_sync_device_params_initialize(&device_params);
+
+  // The VM instance is only used to construct the loader, so the local
+  // reference is dropped right after, whether or not creation succeeded.
+  iree_vm_instance_t* vm_instance = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_instance_create(host_allocator, &vm_instance));
+  iree_hal_executable_loader_t* vmvx_loader = NULL;
+  iree_status_t status = iree_hal_vmvx_module_loader_create(
+      vm_instance, host_allocator, &vmvx_loader);
+  iree_vm_instance_release(vm_instance);
+
+  iree_hal_allocator_t* heap_allocator = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_allocator_create_heap(device_id, host_allocator,
+                                            host_allocator, &heap_allocator);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_sync_device_create(
+        device_id, &device_params, /*loader_count=*/1, &vmvx_loader,
+        heap_allocator, host_allocator, out_device);
+  }
+
+  // Drop the local references; this runs on both success and failure paths.
+  iree_hal_allocator_release(heap_allocator);
+  iree_hal_executable_loader_release(vmvx_loader);
+  return status;
+}
+
+const iree_const_byte_span_t load_bytecode_module_data() {
+  // Expose the embedded VMVX module's data pointer and size as a byte span.
+  const struct iree_file_toc_t* toc =
+      iree_samples_simple_embedding_test_module_vmvx_create();
+  return iree_make_const_byte_span(toc->data, toc->size);
+}
diff --git a/samples/simple_embedding/device_vulkan.c b/samples/simple_embedding/device_vulkan.c
new file mode 100644
index 0000000..dce0b9f
--- /dev/null
+++ b/samples/simple_embedding/device_vulkan.c
@@ -0,0 +1,45 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// An example of setting up the Vulkan driver.
+
+#include <stddef.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/registration/driver_module.h"
+
+// Compiled module embedded here to avoid file IO:
+#include "samples/simple_embedding/simple_embedding_test_bytecode_module_vulkan_c.h"
+
+// Creates the default Vulkan device in |out_device|; returns creation errors.
+iree_status_t create_sample_device(iree_allocator_t host_allocator,
+                                   iree_hal_device_t** out_device) {
+  // Only register the Vulkan HAL driver.
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_driver_module_register(
+      iree_hal_driver_registry_default()));
+  // Create the HAL driver from the name.
+  iree_hal_driver_t* driver = NULL;
+  iree_string_view_t identifier = iree_make_cstring_view("vulkan");
+  iree_status_t status = iree_hal_driver_registry_try_create_by_name(
+      iree_hal_driver_registry_default(), identifier, host_allocator, &driver);
+
+  // Create the default device (primary GPU).
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_driver_create_default_device(driver, host_allocator,
+                                                   out_device);
+  }
+
+  iree_hal_driver_release(driver);
+  return status;  // Not iree_ok_status(): failures must reach the caller.
+}
+
+const iree_const_byte_span_t load_bytecode_module_data() {
+  // Expose the embedded Vulkan module's data pointer and size as a byte span.
+  const struct iree_file_toc_t* toc =
+      iree_samples_simple_embedding_test_module_vulkan_create();
+  return iree_make_const_byte_span(toc->data, toc->size);
+}
diff --git a/samples/simple_embedding/simple_embedding.c b/samples/simple_embedding/simple_embedding.c
new file mode 100644
index 0000000..69be569
--- /dev/null
+++ b/samples/simple_embedding/simple_embedding.c
@@ -0,0 +1,168 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// An example of setting up the HAL module to run simple pointwise array
+// multiplication with the device implemented by different backends via
+// create_sample_device().
+//
+// NOTE: this file does not properly handle error cases and will leak on
+// failure. Applications that are just going to exit()/abort() on failure can
+// probably get away with the same thing but really should prefer not to.
+
+#include <stdio.h>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/modules/hal/module.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module.h"
+
+// Creates the HAL device for the backend this sample is linked against; each
+// device_*.c file supplies its own definition. On success the device is
+// returned in |out_device| and must be released by the caller.
+extern iree_status_t create_sample_device(iree_allocator_t host_allocator,
+                                          iree_hal_device_t** out_device);
+
+// Returns the embedded VM bytecode module data for the backend selected at
+// build time. The module is generated for the specific backend and platform.
+extern const iree_const_byte_span_t load_bytecode_module_data();
+
+// Runs the simple_mul sample end to end and verifies every result equals 8.
+iree_status_t Run() {
+  // TODO(benvanik): move to instance-based registration.
+  IREE_RETURN_IF_ERROR(iree_hal_module_register_types());
+
+  iree_vm_instance_t* instance = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_vm_instance_create(iree_allocator_system(), &instance));
+
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(create_sample_device(iree_allocator_system(), &device),
+                       "create device");
+  iree_vm_module_t* hal_module = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_module_create(device, iree_allocator_system(), &hal_module));
+
+  // Get the embedded bytecode module data for the backend that was linked in.
+  const iree_const_byte_span_t module_data = load_bytecode_module_data();
+
+  iree_vm_module_t* bytecode_module = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_bytecode_module_create(
+      module_data, iree_allocator_null(), iree_allocator_system(),
+      &bytecode_module));
+
+  // Allocate a context that will hold the module state across invocations.
+  iree_vm_context_t* context = NULL;
+  iree_vm_module_t* modules[] = {hal_module, bytecode_module};
+  IREE_RETURN_IF_ERROR(iree_vm_context_create_with_modules(
+      instance, IREE_VM_CONTEXT_FLAG_NONE, &modules[0], IREE_ARRAYSIZE(modules),
+      iree_allocator_system(), &context));
+  iree_vm_module_release(hal_module);
+  iree_vm_module_release(bytecode_module);  // Local module refs unused below.
+
+  // Look up the entry point function. Note that we use the synchronous
+  // variant which operates on pure type/shape erased buffers.
+  const char kMainFunctionName[] = "module.simple_mul";
+  iree_vm_function_t main_function;
+  IREE_RETURN_IF_ERROR(iree_vm_context_resolve_function(
+      context, iree_make_cstring_view(kMainFunctionName), &main_function));
+
+  // Initial buffer contents for 4 * 2 = 8.
+  const float kFloat4[] = {4.0f, 4.0f, 4.0f, 4.0f};
+  const float kFloat2[] = {2.0f, 2.0f, 2.0f, 2.0f};
+
+  // Allocate buffers in device-local memory so that if the device has an
+  // independent address space they live on the fast side of the fence.
+  iree_hal_dim_t shape[1] = {IREE_ARRAYSIZE(kFloat4)};
+  iree_hal_buffer_view_t* arg0_buffer_view = NULL;
+  iree_hal_buffer_view_t* arg1_buffer_view = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_allocate_buffer(
+      iree_hal_device_allocator(device), shape, IREE_ARRAYSIZE(shape),
+      IREE_HAL_ELEMENT_TYPE_FLOAT_32, IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
+      (iree_hal_buffer_params_t){
+          .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+          .usage =
+              IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER,
+      },
+      iree_make_const_byte_span(kFloat4, sizeof(kFloat4)), &arg0_buffer_view));
+  IREE_RETURN_IF_ERROR(iree_hal_buffer_view_allocate_buffer(
+      iree_hal_device_allocator(device), shape, IREE_ARRAYSIZE(shape),
+      IREE_HAL_ELEMENT_TYPE_FLOAT_32, IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR,
+      (iree_hal_buffer_params_t){
+          .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
+          .usage =
+              IREE_HAL_BUFFER_USAGE_DISPATCH | IREE_HAL_BUFFER_USAGE_TRANSFER,
+      },
+      iree_make_const_byte_span(kFloat2, sizeof(kFloat2)), &arg1_buffer_view));
+
+  // Set up call inputs; both buffer views are handed to the list via *_move.
+  iree_vm_list_t* inputs = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_list_create(
+                           /*element_type=*/NULL,
+                           /*capacity=*/2, iree_allocator_system(), &inputs),
+                       "can't allocate input vm list");
+
+  iree_vm_ref_t arg0_buffer_view_ref =
+      iree_hal_buffer_view_move_ref(arg0_buffer_view);
+  iree_vm_ref_t arg1_buffer_view_ref =
+      iree_hal_buffer_view_move_ref(arg1_buffer_view);
+  IREE_RETURN_IF_ERROR(
+      iree_vm_list_push_ref_move(inputs, &arg0_buffer_view_ref));
+  IREE_RETURN_IF_ERROR(
+      iree_vm_list_push_ref_move(inputs, &arg1_buffer_view_ref));
+
+  // Prepare the outputs list to accept the results from the invocation; it is
+  // created with the system allocator and a capacity of one element.
+  iree_vm_list_t* outputs = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_list_create(
+                           /*element_type=*/NULL,
+                           /*capacity=*/1, iree_allocator_system(), &outputs),
+                       "can't allocate output vm list");
+
+  // Synchronously invoke the function.
+  IREE_RETURN_IF_ERROR(iree_vm_invoke(
+      context, main_function, IREE_VM_INVOCATION_FLAG_NONE,
+      /*policy=*/NULL, inputs, outputs, iree_allocator_system()));
+
+  // Get the result buffer view from the first element of the outputs list.
+  iree_hal_buffer_view_t* ret_buffer_view =
+      (iree_hal_buffer_view_t*)iree_vm_list_get_ref_deref(
+          outputs, 0, iree_hal_buffer_view_get_descriptor());
+  if (ret_buffer_view == NULL) {
+    return iree_make_status(IREE_STATUS_NOT_FOUND,
+                            "can't find return buffer view");
+  }
+
+  // Read the results back to the host and require every element to be 8.0f.
+  float results[] = {0.0f, 0.0f, 0.0f, 0.0f};
+  IREE_RETURN_IF_ERROR(iree_hal_device_transfer_d2h(
+      device, iree_hal_buffer_view_buffer(ret_buffer_view), 0, results,
+      sizeof(results), IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+      iree_infinite_timeout()));
+  for (iree_host_size_t i = 0; i < IREE_ARRAYSIZE(results); ++i) {
+    if (results[i] != 8.0f) {
+      return iree_make_status(IREE_STATUS_UNKNOWN, "result mismatches");
+    }
+  }
+
+  iree_vm_list_release(inputs);
+  iree_vm_list_release(outputs);
+  iree_hal_device_release(device);
+  iree_vm_context_release(context);
+  iree_vm_instance_release(instance);
+  return iree_ok_status();
+}
+
+int main() {
+  const iree_status_t run_status = Run();
+  const int exit_code = (int)iree_status_code(run_status);  // Process result.
+  if (!iree_status_is_ok(run_status)) {
+    iree_status_fprint(stderr, run_status);  // Report the failure on stderr.
+    iree_status_free(run_status);
+  }
+  fputs("simple_embedding done\n", stdout);  // Printed even on failure.
+  return exit_code;
+}
diff --git a/samples/simple_embedding/simple_embedding_test.mlir b/samples/simple_embedding/simple_embedding_test.mlir
new file mode 100644
index 0000000..6596219
--- /dev/null
+++ b/samples/simple_embedding/simple_embedding_test.mlir
@@ -0,0 +1,5 @@
+// Multiplies two 4-element f32 tensors element-wise.
+func.func @simple_mul(%lhs: tensor<4xf32>, %rhs: tensor<4xf32>) -> tensor<4xf32> {
+  %product = "mhlo.multiply"(%lhs, %rhs) {name = "mul.1"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+  return %product : tensor<4xf32>
+}