Merge changes from topic "kelvin-isa-doc-formatting"

* changes:
  Wrap word with __ with backticks for md formatting
  Remove stray backtick to preserve md format
  Adjust backticks to preserve markdown format
diff --git a/.bazelrc b/.bazelrc
index 99fc0bd..5dd1c3b 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -6,3 +6,14 @@
 build --incompatible_enable_cc_toolchain_resolution
 
 build:kelvin --platforms=//platforms/riscv32:kelvin
+
+# Test environment variable for kelvin_sim test_runner.sh
+test --test_env=ROOTDIR
+
+# Set preprocessor defines for tflite-micro.
+build --copt=-DTF_LITE_USE_GLOBAL_CMATH_FUNCTIONS
+build --copt=-DTF_LITE_USE_GLOBAL_MIN
+build --copt=-DTF_LITE_USE_GLOBAL_MAX
+build --copt=-DTF_LITE_MCU_DEBUG_LOG
+build --copt=-DTF_LITE_STATIC_MEMORY
+build:opt --copt=-DTF_LITE_STRIP_ERROR_STRINGS
diff --git a/README.md b/README.md
index 6c95545..35f2db5 100644
--- a/README.md
+++ b/README.md
@@ -21,10 +21,12 @@
 * build_tools: Build tool/rules for both Bazel and CMake
 * crt: Kelvin BSP
 * examples: Source code to build Kelvin SW artifacts.
+* host_tools: host tool to generate the intrinsic header and toolchain op files
 * platforms: Crosscompile platform setup for Bazel.
+* tests: Tests to exercise features of the Kelvin core.
+* tflm: Support code and optimized routines for TFLM.
 * third_party: Third party repositories for Bazel.
 * toolchains: Crosscomple toolchain setup for Bazel.
-* host_tools: host tool to generate the intrinsic header and toolchain op files
 
 ## Build the project
 
@@ -37,6 +39,14 @@
 bazel build //...
 ```
 
+To run the unit tests (with the kelvin_sim ISS), make sure the environment
+variable `$ROOTDIR` is set to the root directory of the local repo (or run
+`source build/setup.sh`):
+
+```bash
+bazel test //...
+```
+
 ### CMake
 
 ```note
@@ -45,8 +55,12 @@
 
 ## Run the executable
 
+The binaries can be simulated with the kelvin simulator, located at
+`<dir>/sim/kelvin`.
+
 ```note
-TODO: Add kelvin simulator
+kelvin_sim <elf location>
 ```
 
-Load the generated `.bin` binaries to matcha FPGA emulator.
+Load the generated `.bin` binaries to the FPGA emulator/Renode simulator.
+
diff --git a/WORKSPACE b/WORKSPACE
index 1912123..8a677b0 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,8 +1,26 @@
-workspace(name = "kelvin")
+workspace(name = "kelvin_sw")
 
-load("//build_tools/bazel:repos.bzl", "kelvin_repos")
+load("//build_tools/bazel:repos.bzl", "kelvin_repos", "model_repos", "tflm_repos")
+
 kelvin_repos()
 
 # Register Kelvin toolchain
 load("//platforms:registration.bzl", "kelvin_register_toolchain")
+
 kelvin_register_toolchain()
+
+tflm_repos()
+
+load("@tflite-micro//tensorflow:workspace.bzl", "tf_repositories")
+tf_repositories()
+
+load("@rules_python//python:pip.bzl", "pip_parse")
+pip_parse(
+    name = "tflm_pip_deps",
+    requirements_lock = "@tflite-micro//third_party:python_requirements.txt",
+)
+
+load("@tflm_pip_deps//:requirements.bzl", "install_deps")
+install_deps()
+
+model_repos()
diff --git a/build_tools/BUILD b/build_tools/BUILD
new file mode 100644
index 0000000..d53f03c
--- /dev/null
+++ b/build_tools/BUILD
@@ -0,0 +1,5 @@
+package(default_visibility = ["//visibility:public"])
+
+exports_files([
+    "test_runner.sh",
+])
diff --git a/build_tools/bazel/kelvin.bzl b/build_tools/bazel/kelvin.bzl
index f3f39aa..e6fb102 100644
--- a/build_tools/bazel/kelvin.bzl
+++ b/build_tools/bazel/kelvin.bzl
@@ -189,3 +189,54 @@
         srcs = [name],
         output_group = "elf_file",
     )
+
+def kelvin_test(
+        name,
+        size = "small",
+        **kwargs):
+    """A sh_test wrapper for kelvin binaries
+
+    A wrapper to build kelvin_binary and test it against build_tools/test_runner.sh
+
+    Args:
+      name: The name of this rule.
+      size: Test size. Defaults to small.
+      **kwargs: Arguments that will be forwarded to kelvin_binary.
+    """
+
+    kelvin_elf = "{}_elf".format(name)
+    kelvin_binary(
+        name = kelvin_elf,
+        **kwargs
+    )
+
+    native.sh_test(
+        name = name,
+        size = size,
+        srcs = [
+            "//build_tools:test_runner.sh",
+        ],
+        args = [
+            "$(location %s.elf)" % kelvin_elf,
+        ],
+        data = [
+            "{}.elf".format(kelvin_elf),
+        ],
+    )
+
+# From @tflite-micro//tensorflow/lite/micro/build_def.bzl, and paths
+# modified to point to the external repo.
+def generate_cc_arrays(name, src, out, visibility = None, tags = []):
+    native.genrule(
+        name = name,
+        srcs = [
+            src,
+        ],
+        outs = [
+            out,
+        ],
+        tags = tags,
+        cmd = "$(location @tflite-micro//tensorflow/lite/micro/tools:generate_cc_arrays) $@ $<",
+        tools = ["@tflite-micro//tensorflow/lite/micro/tools:generate_cc_arrays"],
+        visibility = visibility,
+    )
diff --git a/build_tools/bazel/repos.bzl b/build_tools/bazel/repos.bzl
index 7e9cfd6..416296e 100644
--- a/build_tools/bazel/repos.bzl
+++ b/build_tools/bazel/repos.bzl
@@ -1,7 +1,6 @@
 """Kelvin dependency repository setup."""
 
 load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
-load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 def kelvin_repos():
@@ -25,10 +24,41 @@
     )
 
     # risc-v isa test
-    new_git_repository(
+    http_archive(
         name = "riscv-tests",
-        commit = "d649367a1386609da3d10e9e6d388f98781dd35f",
         build_file = "//third_party/riscv:BUILD.riscv-tests",
-        shallow_since = "1636745372 -0800",
-        remote = "https://spacebeaker.googlesource.com/shodan/3p/riscv/riscv-tests",
+        sha256 = "1c7eb58edd7399b3ad2f9624a2003862cd87a6904237a737f39cd3978bab46a8",
+        urls = ["https://github.com/riscv-software-src/riscv-tests/archive/d4eaa5bd6674b51d3b9b24913713c4638e99cdd9.tar.gz"],
+        strip_prefix = "riscv-tests-d4eaa5bd6674b51d3b9b24913713c4638e99cdd9",
+    )
+
+def tflm_repos():
+    """Setup Tensorflow Lite For Microcontrollers repositories."""
+    # Tensorflow Lite for Microcontrollers
+    native.local_repository(
+        name = "tflite-micro",
+        path = "../../sw/tflite-micro",
+    )
+
+    maybe(
+        http_archive,
+        name = "rules_python",
+        sha256 = "497ca47374f48c8b067d786b512ac10a276211810f4a580178ee9b9ad139323a",
+        strip_prefix = "rules_python-0.16.1",
+        url = "https://github.com/bazelbuild/rules_python/archive/refs/tags/0.16.1.tar.gz",
+    )
+
+    maybe(
+        http_archive,
+        name = "pybind11_bazel",
+        strip_prefix = "pybind11_bazel-faf56fb3df11287f26dbc66fdedf60a2fc2c6631",
+        urls = ["https://github.com/pybind/pybind11_bazel/archive/faf56fb3df11287f26dbc66fdedf60a2fc2c6631.zip"],
+        sha256 = "a185aa68c93b9f62c80fcb3aadc3c83c763854750dc3f38be1dadcb7be223837",
+    )
+
+def model_repos():
+    native.new_local_repository(
+        name = "ml-models",
+        path = "../../ml/ml-models",
+        build_file = "third_party/ml-models/BUILD",
     )
diff --git a/build_tools/test_runner.sh b/build_tools/test_runner.sh
new file mode 100755
index 0000000..822606b
--- /dev/null
+++ b/build_tools/test_runner.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Runs a Kelvin ELF on kelvin_sim; the test passes only if the simulator
+# output contains "Program exits properly". Requires ROOTDIR to be set.
+function print_usage {
+  echo "Usage: test_runner.sh <elf location>"
+}
+
+if [[ "$1" == "--help" ]]; then
+  print_usage
+  exit 0
+fi
+if [[ -z "${ROOTDIR}" ]]; then
+  echo "Please run \"source build/setup.sh\" first"
+  exit 1
+fi
+KELVIN_SIM="${ROOTDIR}/out/kelvin/sim/kelvin_sim"
+if [[ ! -f "${KELVIN_SIM}" ]]; then
+  echo "Please run \"m kelvin_sim\" first"
+  exit 1
+fi
+if (( $# != 1 )); then
+  print_usage
+  exit 1
+fi
+
+# Quote every path so ROOTDIR / ELF locations containing spaces still work.
+ELF_FILE="$(realpath "$1")"
+SIM_OUT="$("${KELVIN_SIM}" "${ELF_FILE}")"
+echo "${SIM_OUT}"
+if [[ "${SIM_OUT}" != *"Program exits properly"* ]]; then
+  exit 1
+fi
diff --git a/crt/CMakeLists.txt b/crt/CMakeLists.txt
new file mode 100644
index 0000000..cefab25
--- /dev/null
+++ b/crt/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright 2023 Google LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+enable_language(ASM)
+
+
+include_directories(BEFORE SYSTEM $ENV{ROOTDIR}/sw/kelvin)
+
+# Build Kelvin
+add_library(kelvin INTERFACE)
+add_library(kelvin_intrinsic STATIC)
+target_sources(kelvin_intrinsic
+    PRIVATE
+      crt.S
+      kelvin_start.S
+      kelvin_gloss.cc
+)
+
+target_include_directories(kelvin_intrinsic PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
+
+target_link_libraries(kelvin
+INTERFACE
+  kelvin_intrinsic
+)
+
+target_include_directories(kelvin INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}")
+
+target_link_options(kelvin
+    INTERFACE
+      -Wl,--whole-archive ${CMAKE_CURRENT_BINARY_DIR}/libkelvin_intrinsic.a -Wl,--no-whole-archive,--no-warn-rwx-segments
+)
diff --git a/crt/kelvin.ld b/crt/kelvin.ld
index 08f3542..7935224 100644
--- a/crt/kelvin.ld
+++ b/crt/kelvin.ld
@@ -51,15 +51,15 @@
     *(.text)
     *(.text.*)
     . = ALIGN(4);
-    __etext = .;
+    __text_end__ = .;
   } > TCM
 
   .init.array : ALIGN(4) {
-    __init_array_start = .;
+    __init_array_start__ = .;
     *(.init_array)
     *(.init_array.*)
     . = ALIGN(4);
-    __init_array_end = .;
+    __init_array_end__ = .;
   } > TCM
 
   .rodata : ALIGN(4) {
@@ -86,7 +86,7 @@
     *(.data.*)
     /* Align on 256 width. */
     . = ALIGN(256);
-      __data_end__ = .;
+    __data_end__ = .;
   } > TCM
 
   .bss : ALIGN(4) {
@@ -112,7 +112,6 @@
     . += STACK_SIZE;
     . = ALIGN(64);
     __stack_end__ = .;
-    _stack_end = .;
   } > TCM
 
   .model_output ORIGIN(TCM) + LENGTH(TCM) - OUTPUT_SIZE - __output_header_size__ (NOLOAD) : {
diff --git a/crt/kelvin_gloss.cc b/crt/kelvin_gloss.cc
index c5859f1..9d13823 100644
--- a/crt/kelvin_gloss.cc
+++ b/crt/kelvin_gloss.cc
@@ -19,6 +19,8 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "crt/kelvin.h"
+
 void* __dso_handle = reinterpret_cast<void*>(&__dso_handle);
 
 extern "C" int _close(int file) { return -1; }
@@ -57,10 +59,47 @@
   return -1;
 }
 
-// TODO(hcindyl/lundong): implement printf properly.
-extern "C" int _write(int file, char* ptr, int len) {
-  errno = EBADF;
-  return -1;
+#ifndef LOG_MAX_SZ
+#define LOG_MAX_SZ 256
+#endif
+// Low-level write stub for stdout/stderr. Bytes are collected in a static
+// line buffer and printed via printf once a '\n' arrives or LOG_MAX_SZ - 1
+// bytes are pending. Returns bytes consumed, or -1 with errno (EBADF/EFAULT).
+// TODO(lundong): Handle stdout and stderr separately
+extern "C" int _write(int file, char* buf, int nbytes) {
+  static int _write_line_buffer_len = 0;
+  static char _write_line_buffer[LOG_MAX_SZ];
+
+  if (file != STDOUT_FILENO && file != STDERR_FILENO) {
+    errno = EBADF;
+    return -1;
+  }
+  if (nbytes <= 0) {
+    return 0;
+  }
+  if (buf == NULL) {
+    errno = EFAULT;
+    return -1;
+  }
+
+  int bytes_read = 0;
+  do {
+    int len = _write_line_buffer_len;
+    const char c = *(buf++);
+    bytes_read++;
+
+    _write_line_buffer[len++] = c;
+    // Flush only on end-of-line or when the buffer is full. Probing the byte
+    // at [len] instead reads stale data and flushes after every character.
+    if (len == LOG_MAX_SZ - 1 || c == '\n') {
+      _write_line_buffer[len] = '\0';
+      printf("%s", _write_line_buffer);
+      len = 0;
+    }
+    _write_line_buffer_len = len;
+  } while (bytes_read < nbytes);
+
+  return bytes_read;
 }
 
 extern "C" int _open(const char* path, int flags, ...) { return -1; }
diff --git a/crt/kelvin_start.S b/crt/kelvin_start.S
index b162fdc..cd2496f 100644
--- a/crt/kelvin_start.S
+++ b/crt/kelvin_start.S
@@ -15,10 +15,8 @@
         ###############################################
         # Put all scalar registers into a known state #
         ###############################################
-        la   sp, _stack_end
+        la   sp, __stack_end__
         la   gp, _global_pointer
-        la   s0, __heap_start__
-        sw   s0, _heap_ptr, s1
         mv   tp, zero
         mv   t1, zero
         mv   t2, zero
@@ -51,9 +49,13 @@
         la   a1, __bss_end__
         call crt_section_clear
 
+        # Initialize the heap ptr after clearing BSS
+        la   s0, __heap_start__
+        sw   s0, _heap_ptr, s1
+
         # Initialize arrays
-        la   s0, __init_array_start
-        la   s1, __init_array_end
+        la   s0, __init_array_start__
+        la   s1, __init_array_end__
         bgeu s0, s1, init_array_loop_end
 init_array_loop:
         lw   t0, 0(s0)
@@ -62,6 +64,13 @@
         bltu s0, s1, init_array_loop
 init_array_loop_end:
 
+        # Set up sentinel value in _ret
+        # If we see this after an ebreak,
+        # the break source is unlikely to
+        # be a clean return from main.
+        la   t0, _ret
+        li   a0, 0x0badd00d
+        sw   a0, 0(t0)
 
         #############
         # Call main #
@@ -73,6 +82,11 @@
         # Store the application's return value at _ret
         la   t0, _ret
         sw   a0, 0(t0)
+        beqz a0, success
+failure:
+        ebreak
+        j    loop
+success:
         mpause      # Kelvin end of program op
-1:
-        j    1b
+loop:
+        j    loop
diff --git a/crt/log.h b/crt/log.h
new file mode 100644
index 0000000..d404f5b
--- /dev/null
+++ b/crt/log.h
@@ -0,0 +1,31 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Kelvin logging helper header
+
+#ifndef CRT_LOG_H_
+#define CRT_LOG_H_
+
+#include <stdio.h>
+
+#define LOG_MAX_SZ 256
+
+static inline void kelvin_simprint(const char *_string) {
+  __asm__ volatile("flog %0 \n\t" : : "r"(_string));
+}
+
+#define SIMLOG(fmt, ...)                                 \
+  do {                                                   \
+    char tmp_log_msg[LOG_MAX_SZ];                        \
+    snprintf(tmp_log_msg, LOG_MAX_SZ, fmt, __VA_ARGS__); \
+    kelvin_simprint(tmp_log_msg);                        \
+  } while (0)
+
+#define LOG_ERROR(msg, args...) SIMLOG("%s |" msg "\n", "ERROR", ##args)
+#define LOG_WARN(msg, args...) SIMLOG("%s |" msg "\n", "WARN", ##args)
+#define LOG_INFO(msg, args...) SIMLOG("%s |" msg "\n", "INFO", ##args)
+#define LOG_DEBUG(msg, args...) SIMLOG("%s |" msg "\n", "DEBUG", ##args)
+#define LOG_NOISY(msg, args...) SIMLOG("%s |" msg "\n", "NOISY", ##args)
+
+#endif  // CRT_LOG_H_
diff --git a/docs/kelvin_isa.md b/docs/kelvin_isa.md
index 50d9d27..b00f3f5 100644
--- a/docs/kelvin_isa.md
+++ b/docs/kelvin_isa.md
@@ -32,17 +32,17 @@
 31..26 | 25..20 | 19..14 | 13..12 | 11..6 | 5   | 4..2  | 1..0 | form
 :----: | :----: | :----: | :----: | :---: | :-: | :---: | :--: | :--:
 func2  | vs2    | vs1    | sz     | vd    | m   | func1 | 00   | .vv
-func2  | xs2    | vs1    | sz     | vd    | m   | func1 | 10   | .vx
+func2  | [0]xs2 | vs1    | sz     | vd    | m   | func1 | 10   | .vx
 func2  | 000000 | vs1    | sz     | vd    | m   | func1 | 10   | .v
-func2  | xs2    | xs1    | sz     | vd    | m   | 111   | 11   | .xx
-func2  | 000000 | xs1    | sz     | vd    | m   | 111   | 11   | .x
+func2  | [0]xs2 | xs1[0] | sz     | vd    | m   | 111   | 11   | .xx
+func2  | 000000 | xs1[0] | sz     | vd    | m   | 111   | 11   | .x
 
 <br>
 
-31..26 | 25..20 | 19..14 | 13..12 | 11..6 | 5   | 4..3  | 2..0 | form
-:----: | :----: | :----: | :----: | :---: | :-: | :---: | :--: | :--:
-vs3    | vs2    | vs1    | func3  | vd    | m   | func3 | 001  | .vvv
-vs3    | z,xs2  | vs1    | func3  | vd    | m   | func3 | 101  | .vxv
+31..26 | 25..20 | 19..14 | 13..12      | 11..6 | 5   | 4..3       | 2..0 | form
+:----: | :----: | :----: | :--------:  | :---: | :-: | :--------: | :--: | :--:
+vs3    | vs2    | vs1    | func3[3:2]  | vd    | m   | func3[1:0] | 001  | .vvv
+vs3    | [0]xs2 | vs1    | func3[3:2]  | vd    | m   | func3[1:0] | 101  | .vxv
 
 ### Types ".b" ".h" ".w"
 
@@ -96,7 +96,7 @@
 0     | ""
 1     | ".m"
 
-### 2-arg .xx
+### 2-arg .xx (Load / Store)
 
 Instruction | func2     | Notes
 :---------: | :-------: | :--------:
@@ -126,11 +126,21 @@
 post-increment were programmatic behavior then a register where xs2!=x0 would be
 used.
 
-### 1-arg .x
+**NOTE**: Scalar register `xs1` uses the same encoding bitfield as the vector
+register `vs1`, but **HAS ONE BIT PADDED AT LSB**. That is `xs1` has the same
+encoding as the regular RISC-V instructions (bit[19:15]). On the other hand,
+`xs2` shares the same encoding bitfield as `vs2`, but **HAS ONE BIT PADDED AT MSB**,
+so it is consistent with the regular RISC-V instructions (bit[24:20]).
+
+### 1-arg .x (Load / Store)
 
 Instructions of the format "op.xx vd, xs1, x0" (xs2=x0, the scalar zero
 register) are reduced to the shortened form "op.x vd, xs1".
 
+**NOTE**: Scalar register `xs1` uses the same encoding bitfield as the vector
+register `vs1`, but **HAS ONE BIT PADDED AT LSB**. That is `xs1` has the same
+encoding as the regular RISC-V instructions (bit[19:15]).
+
 ### 0-arg
 
 Instructions of the format "op.xx vd, x0, x0" (xs1=x0, xs2=x0, the scalar zero
@@ -219,7 +229,7 @@
 Instruction | func3 | Notes
 :---------: | :---: | :-----------------------:
 aconv       | 8     | scalar: sign
-adwconv     | 10    | scalar: sign/type/swizzle
+vdwconv     | 10    | scalar: sign/type/swizzle
 
 ### Typeless
 
@@ -1146,18 +1156,18 @@
 
 #### VGE
 
-Integer greater-than comparison.
+Integer greater-than-or-equal comparison.
 
 **Encodings**
 
-vgt.[b,h,w].{u}.vv.{m} vd, vs1, vs2 \
-vgt.[b,h,w].{u}.vx.{m} vd, vs1, xs2
+vge.[b,h,w].{u}.vv.{m} vd, vs1, vs2 \
+vge.[b,h,w].{u}.vx.{m} vd, vs1, xs2
 
 **Operation**
 
 ```
 for L in Op.typelen
-  vd[L] = vs1[L] > vs2[L] ? 1 : 0
+  vd[L] = vs1[L] >= vs2[L] ? 1 : 0
 ```
 
 --------------------------------------------------------------------------------
diff --git a/examples/hello_world/BUILD b/examples/hello_world/BUILD
index 6f91382..5e7afc3 100644
--- a/examples/hello_world/BUILD
+++ b/examples/hello_world/BUILD
@@ -5,4 +5,7 @@
     srcs = [
         "hello_world.c",
     ],
+    deps = [
+        "//crt:crt_header",
+    ]
 )
diff --git a/examples/tflm/person_detection/BUILD b/examples/tflm/person_detection/BUILD
new file mode 100644
index 0000000..4dcbb9a
--- /dev/null
+++ b/examples/tflm/person_detection/BUILD
@@ -0,0 +1,43 @@
+load("//build_tools/bazel:kelvin.bzl", "kelvin_binary", "generate_cc_arrays")
+package(default_visibility = ["//visibility:public"])
+
+kelvin_binary(
+    name = "person_detection",
+    srcs = [
+        "person_detection.cc",
+        "person_detect_tflite.cc",
+        "person_bmp.cc",
+    ],
+    hdrs = [
+        "person_bmp.h",
+        "person_detect_tflite.h",
+    ],
+    deps = [
+        "//crt:crt_header",
+        "@tflite-micro//tensorflow/lite/micro:micro_framework",
+        "@tflite-micro//tensorflow/lite/micro:system_setup",
+    ],
+)
+
+generate_cc_arrays(
+    name = "person_bmp_cc",
+    src = "@tflite-micro//tensorflow/lite/micro/examples/person_detection:testdata/person.bmp",
+    out = "person_bmp.cc",
+)
+generate_cc_arrays(
+    name = "person_bmp_h",
+    src = "@tflite-micro//tensorflow/lite/micro/examples/person_detection:testdata/person.bmp",
+    out = "person_bmp.h",
+)
+
+generate_cc_arrays(
+    name = "person_detect_tflite_cc",
+    src = "@tflite-micro//tensorflow/lite/micro/models:person_detect.tflite",
+    out = "person_detect_tflite.cc",
+)
+
+generate_cc_arrays(
+    name = "person_detect_tflite_h",
+    src = "@tflite-micro//tensorflow/lite/micro/models:person_detect.tflite",
+    out = "person_detect_tflite.h",
+)
diff --git a/examples/tflm/person_detection/person_detection.cc b/examples/tflm/person_detection/person_detection.cc
new file mode 100644
index 0000000..4b608c6
--- /dev/null
+++ b/examples/tflm/person_detection/person_detection.cc
@@ -0,0 +1,57 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "examples/tflm/person_detection/person_bmp.h"
+#include "examples/tflm/person_detection/person_detect_tflite.h"
+#include "tensorflow/lite/micro/micro_interpreter.h"
+#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
+#include "tensorflow/lite/micro/system_setup.h"
+
+namespace {
+const tflite::Model* model = nullptr;
+tflite::MicroInterpreter* interpreter = nullptr;
+constexpr int kTensorArenaSize = 96 * 1024;
+uint8_t tensor_arena[kTensorArenaSize] __attribute__((aligned(64)));
+}  // namespace
+
+extern "C" int main(int argc, char** argv) {
+  tflite::InitializeTarget();
+
+  model = tflite::GetModel(g_person_detect_model_data);
+
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    return 1;
+  }
+
+  static tflite::MicroMutableOpResolver<5> micro_op_resolver;
+  micro_op_resolver.AddAveragePool2D();
+  micro_op_resolver.AddConv2D();
+  micro_op_resolver.AddDepthwiseConv2D();
+  micro_op_resolver.AddReshape();
+  micro_op_resolver.AddSoftmax();
+
+  static tflite::MicroInterpreter static_interpreter(
+      model, micro_op_resolver, tensor_arena, kTensorArenaSize);
+  interpreter = &static_interpreter;
+
+  TfLiteStatus allocate_status = interpreter->AllocateTensors();
+  if (allocate_status != kTfLiteOk) {
+    return 2;
+  }
+
+  TfLiteTensor* input = interpreter->input(0);
+  TfLiteTensor* output = interpreter->output(0);
+
+  memcpy(input->data.uint8, g_person_image_data, input->bytes);
+  TfLiteStatus invoke_status = interpreter->Invoke();
+  if (invoke_status != kTfLiteOk) {
+    return 3;
+  }
+
+  int8_t person = output->data.int8[1];
+  int8_t not_person = output->data.int8[0];
+  MicroPrintf("person: %d not_person: %d", person, not_person);
+
+  return 0;
+}
diff --git a/examples/tflm/soundstream/BUILD b/examples/tflm/soundstream/BUILD
new file mode 100644
index 0000000..1188d98
--- /dev/null
+++ b/examples/tflm/soundstream/BUILD
@@ -0,0 +1,60 @@
+load("//build_tools/bazel:kelvin.bzl", "generate_cc_arrays", "kelvin_binary")
+package(default_visibility = ["//visibility:public"])
+
+kelvin_binary(
+    name = "soundstream",
+    srcs = [
+        "soundstream.cc",
+        "best_of_times_s16_wav.cc",
+        "decoder_non_stream_q16x8_b64_io_int16_tflite.cc",
+        "encoder_non_stream_q16x8_b64_io_int16_tflite.cc",
+    ],
+    hdrs = [
+        "best_of_times_s16_wav.h",
+        "decoder_non_stream_q16x8_b64_io_int16_tflite.h",
+        "encoder_non_stream_q16x8_b64_io_int16_tflite.h",
+    ],
+    deps = [
+        "//crt:crt_header",
+        "@tflite-micro//tensorflow/lite/micro:micro_framework",
+        "@tflite-micro//tensorflow/lite/micro:system_setup",
+    ],
+    tags = ["manual"],
+)
+
+generate_cc_arrays(
+    name = "decoder_non_stream_q16x8_b64_io_int16_tflite_cc",
+    src = "@ml-models//:quant_models/_decoder_non_stream_q16x8_b64_io_int16.tflite",
+    out = "decoder_non_stream_q16x8_b64_io_int16_tflite.cc",
+    tags = ["manual"],
+)
+generate_cc_arrays(
+    name = "decoder_non_stream_q16x8_b64_io_int16_tflite_h",
+    src = "@ml-models//:quant_models/_decoder_non_stream_q16x8_b64_io_int16.tflite",
+    out = "decoder_non_stream_q16x8_b64_io_int16_tflite.h",
+    tags = ["manual"],
+)
+
+generate_cc_arrays(
+    name = "encoder_non_stream_q16x8_b64_io_int16_tflite_cc",
+    src = "@ml-models//:quant_models/_encoder_non_stream_q16x8_b64_io_int16.tflite",
+    out = "encoder_non_stream_q16x8_b64_io_int16_tflite.cc",
+    tags = ["manual"],
+)
+generate_cc_arrays(
+    name = "encoder_non_stream_q16x8_b64_io_int16_tflite_h",
+    src = "@ml-models//:quant_models/_encoder_non_stream_q16x8_b64_io_int16.tflite",
+    out = "encoder_non_stream_q16x8_b64_io_int16_tflite.h",
+    tags = ["manual"],
+)
+
+generate_cc_arrays(
+    name = "best_of_times_s16_wav_cc",
+    src = "best_of_times_s16.wav",
+    out = "best_of_times_s16_wav.cc",
+)
+generate_cc_arrays(
+    name = "best_of_times_s16_wav_h",
+    src = "best_of_times_s16.wav",
+    out = "best_of_times_s16_wav.h",
+)
diff --git a/examples/tflm/soundstream/best_of_times_s16.wav b/examples/tflm/soundstream/best_of_times_s16.wav
new file mode 100644
index 0000000..e3a7685
--- /dev/null
+++ b/examples/tflm/soundstream/best_of_times_s16.wav
Binary files differ
diff --git a/examples/tflm/soundstream/soundstream.cc b/examples/tflm/soundstream/soundstream.cc
new file mode 100644
index 0000000..b2327fd
--- /dev/null
+++ b/examples/tflm/soundstream/soundstream.cc
@@ -0,0 +1,102 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "examples/tflm/soundstream/best_of_times_s16_wav.h"
+#include "examples/tflm/soundstream/decoder_non_stream_q16x8_b64_io_int16_tflite.h"
+#include "examples/tflm/soundstream/encoder_non_stream_q16x8_b64_io_int16_tflite.h"
+#include "tensorflow/lite/micro/micro_interpreter.h"
+#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
+
+namespace {
+const tflite::Model *encoder_model = nullptr;
+const tflite::Model *decoder_model = nullptr;
+tflite::MicroInterpreter *encoder_interpreter = nullptr;
+tflite::MicroInterpreter *decoder_interpreter = nullptr;
+constexpr int kTensorArenaSize =
+    96 * 1024;
+uint8_t encoder_tensor_arena[kTensorArenaSize] __attribute__((aligned(16)));
+uint8_t decoder_tensor_arena[kTensorArenaSize] __attribute__((aligned(16)));
+}  // namespace
+
+int main(int argc, char **argv) {
+  encoder_model =
+      tflite::GetModel(g__encoder_non_stream_q16x8_b64_io_int16_model_data);
+  if (encoder_model->version() != TFLITE_SCHEMA_VERSION) {
+    return 1;
+  }
+  decoder_model =
+      tflite::GetModel(g__decoder_non_stream_q16x8_b64_io_int16_model_data);
+  if (decoder_model->version() != TFLITE_SCHEMA_VERSION) {
+    return 1;
+  }
+
+  static tflite::MicroMutableOpResolver<6> encoder_resolver{};
+  encoder_resolver.AddReshape();
+  encoder_resolver.AddPad();
+  encoder_resolver.AddConv2D();
+  encoder_resolver.AddLeakyRelu();
+  encoder_resolver.AddDepthwiseConv2D();
+  encoder_resolver.AddAdd();
+
+  static tflite::MicroMutableOpResolver<11> decoder_resolver{};
+  decoder_resolver.AddReshape();
+  decoder_resolver.AddPad();
+  decoder_resolver.AddConv2D();
+  decoder_resolver.AddLeakyRelu();
+  decoder_resolver.AddSplit();
+  decoder_resolver.AddTransposeConv();
+  decoder_resolver.AddStridedSlice();
+  decoder_resolver.AddConcatenation();
+  decoder_resolver.AddDepthwiseConv2D();
+  decoder_resolver.AddAdd();
+  decoder_resolver.AddQuantize();
+
+  static tflite::MicroInterpreter encoder_static_interpreter(
+      encoder_model, encoder_resolver, encoder_tensor_arena, kTensorArenaSize);
+  encoder_interpreter = &encoder_static_interpreter;
+
+  static tflite::MicroInterpreter decoder_static_interpreter(
+      decoder_model, decoder_resolver, decoder_tensor_arena, kTensorArenaSize);
+  decoder_interpreter = &decoder_static_interpreter;
+
+  TfLiteStatus allocate_status = encoder_interpreter->AllocateTensors();
+  if (allocate_status != kTfLiteOk) {
+    MicroPrintf("Failed to allocate encoder's tensors");
+    return -1;
+  }
+  allocate_status = decoder_interpreter->AllocateTensors();
+  if (allocate_status != kTfLiteOk) {
+    MicroPrintf("Failed to allocate decoder's tensors");
+    return -1;
+  }
+
+  TfLiteTensor *encoder_input = encoder_interpreter->input(0);
+  TfLiteTensor *encoder_output = encoder_interpreter->output(0);
+
+  int invocation_count =
+      g_best_of_times_s16_audio_data_size / encoder_input->bytes;
+  for (int i = 0; i < invocation_count; ++i) {
+    MicroPrintf("Invocation %d of %d", i, invocation_count);
+    memcpy(encoder_input->data.uint8,
+           g_best_of_times_s16_audio_data +
+               ((i * encoder_input->bytes) / sizeof(int16_t)),
+           encoder_input->bytes);
+    TfLiteStatus invoke_status = encoder_interpreter->Invoke();
+    if (invoke_status != kTfLiteOk) {
+      MicroPrintf("Failed to invoke encoder");
+      return -1;
+    }
+
+    TfLiteTensor *decoder_input = decoder_interpreter->input(0);
+    memcpy(decoder_input->data.uint8, encoder_output->data.uint8,
+           decoder_input->bytes);
+    invoke_status = decoder_interpreter->Invoke();
+    if (invoke_status != kTfLiteOk) {
+      MicroPrintf("Failed to invoke decoder");
+      return -1;
+    }
+  }
+
+  return 0;
+}
diff --git a/platforms/riscv32/features/BUILD b/platforms/riscv32/features/BUILD
index 98de11f..737a988 100644
--- a/platforms/riscv32/features/BUILD
+++ b/platforms/riscv32/features/BUILD
@@ -65,22 +65,51 @@
     ],
 )
 
+feature(
+    name = "all_warnings",
+    enabled = True,
+    flag_sets = [
+        flag_set(
+            actions = CPP_ALL_COMPILE_ACTIONS + C_ALL_COMPILE_ACTIONS,
+            flag_groups = [
+                flag_group(
+                    flags = [
+                        "-Wall",
+                    ],
+                ),
+            ],
+        ),
+    ],
+)
+
+feature(
+    name = "all_warnings_as_errors",
+    enabled = False,
+    flag_sets = [
+        flag_set(
+            actions = CPP_ALL_COMPILE_ACTIONS + C_ALL_COMPILE_ACTIONS,
+            flag_groups = [
+                flag_group(
+                    flags = ["-Werror"],
+                ),
+            ],
+        ),
+    ],
+)
+
 feature_set(
     name = "rv32im",
     feature = [
         ":architecture",
         ":sys_spec",
+        ":all_warnings",
+        ":all_warnings_as_errors",
+        "@crt//features/common:includes",
+        "@crt//features/common:reproducible",
+        "@crt//features/common:symbol_garbage_collection",
         "@crt//features/embedded:cc_constructor_destructor",
         "@crt//features/embedded:exceptions",
         "@crt//features/embedded:runtime_type_information",
-        "@crt//platforms/riscv32/features:all_warnings_as_errors",
         "@crt//platforms/riscv32/features:fastbuild",
-        "@crt//features/common:includes",
-        "@crt//features/common:all_warnings",
-        "@crt//features/common:all_warnings_as_errors",
-        "@crt//features/common:reproducible",
-        # TODO(atv): It would be nice to have the feature, but for now enabling
-        # this creates the wrong program.
-        # "@crt//features/common:symbol_garbage_collection",
     ],
 )
diff --git a/tests/kelvin_isa/BUILD b/tests/kelvin_isa/BUILD
index 610ad28..a129b68 100644
--- a/tests/kelvin_isa/BUILD
+++ b/tests/kelvin_isa/BUILD
@@ -1,17 +1,17 @@
-load("//build_tools/bazel:kelvin.bzl", "kelvin_binary")
+load("//build_tools/bazel:kelvin.bzl", "kelvin_test")
 
-kelvin_binary(
-  name = "getvl_test",
-  srcs = [
-    "getvl_test.cc",
-  ],
-  hdrs = [
-    "kelvin_test.h",
-  ],
-  copts = [
-    "-Wno-address",
-  ],
-  deps = [
-    "//crt:crt_header",
-  ]
+kelvin_test(
+    name = "getvl_test",
+    srcs = [
+        "getvl_test.cc",
+    ],
+    hdrs = [
+        "kelvin_test.h",
+    ],
+    copts = [
+        "-Wno-address",
+    ],
+    deps = [
+        "//crt:crt_header",
+    ],
 )
diff --git a/tests/riscv-tests/BUILD b/tests/riscv-tests/BUILD
index 811032e..2b8f273 100644
--- a/tests/riscv-tests/BUILD
+++ b/tests/riscv-tests/BUILD
@@ -1,6 +1,6 @@
 # Build riscv-tests based on Kelvin linker script and termination condition.
 
-load("//build_tools/bazel:kelvin.bzl", "kelvin_binary")
+load("//build_tools/bazel:kelvin.bzl", "kelvin_test")
 
 cc_library(
     name = "riscv_tests_base",
@@ -11,6 +11,9 @@
         "riscv_test.h",
         "@riscv-tests//:isa/macros/scalar/test_macros.h",
     ],
+    deps = [
+        "//crt:crt_header",
+    ],
 )
 
 RV32UI_TESTS = [
@@ -55,7 +58,7 @@
     "xor",
 ]
 
-[kelvin_binary(
+[kelvin_test(
     name = "rv32ui_{}".format(test),
     srcs = [
         # riscv-tests use the rv64 code to set up rv32 tests.
@@ -88,7 +91,7 @@
     "remu",
 ]
 
-[kelvin_binary(
+[kelvin_test(
     name = "rv32um_{}".format(test),
     srcs = [
         "@riscv-tests//:isa/rv32um/{}.S".format(test),
@@ -104,7 +107,7 @@
     ],
 ) for test in RV32UM_TESTS]
 
-kelvin_binary(
+kelvin_test(
     name = "branch_modulo_test",
     srcs = [
         "branch_modulo_test.cc",
@@ -114,7 +117,7 @@
     ],
 )
 
-kelvin_binary(
+kelvin_test(
     name = "branch_div_test",
     srcs = [
         "branch_div_test.cc",
diff --git a/tests/riscv-tests/branch_div_test.cc b/tests/riscv-tests/branch_div_test.cc
index 74344fc..a7a6514 100644
--- a/tests/riscv-tests/branch_div_test.cc
+++ b/tests/riscv-tests/branch_div_test.cc
@@ -35,13 +35,13 @@
   uint32_t a = static_cast<uint32_t>(
       ((rand() & 0xffff) << 16) |  // NOLINT(runtime/threadsafe_fn)
       (rand() & 0xffff));          // NOLINT(runtime/threadsafe_fn)
-  return (a > 0) ? a : 1;
+  return std::max<uint32_t>(a, 1);
 }
 
 int32_t rand_int32_input() {
   int32_t a = ((rand() & 0x7fff) << 16) |  // NOLINT(runtime/threadsafe_fn)
               (rand() & 0xffff);           // NOLINT(runtime/threadsafe_fn)
-  return (a > 0) ? a : 1;
+  return std::max<int32_t>(a, 1);  // a >= 0 here, so only 0 maps to 1
 }
 
 int main() {
diff --git a/tests/riscv-tests/branch_modulo_test.cc b/tests/riscv-tests/branch_modulo_test.cc
index 54ec856..8bd7743 100644
--- a/tests/riscv-tests/branch_modulo_test.cc
+++ b/tests/riscv-tests/branch_modulo_test.cc
@@ -23,13 +23,13 @@
   uint32_t a = static_cast<uint32_t>(
       ((rand() & 0xffff) << 16) |  // NOLINT(runtime/threadsafe_fn)
       (rand() & 0xffff));          // NOLINT(runtime/threadsafe_fn)
-  return (a > 0) ? a : 1;
+  return std::max<uint32_t>(a, 1);
 }
 
 int32_t rand_int32_input() {
   int32_t a = ((rand() & 0x7fff) << 16) |  // NOLINT(runtime/threadsafe_fn)
               (rand() & 0xffff);           // NOLINT(runtime/threadsafe_fn)
-  return (a > 0) ? a : 1;
+  return std::max<int32_t>(a, 1);  // a >= 0 here, so only 0 maps to 1
 }
 
 int main() {
diff --git a/tests/tflm/BUILD b/tests/tflm/BUILD
new file mode 100644
index 0000000..de9127f
--- /dev/null
+++ b/tests/tflm/BUILD
@@ -0,0 +1,18 @@
+load("//build_tools/bazel:kelvin.bzl", "kelvin_test")
+package(default_visibility = ["//visibility:public"])
+
+kelvin_test(
+    name = "reshape_test",
+    srcs = [
+        "@tflite-micro//tensorflow/lite/micro/kernels:reshape_test.cc",  # test source comes from the tflite-micro repo
+    ],
+    deps = [
+        "//crt:crt_header",
+        "@tflite-micro//tensorflow/lite/c:common",
+        "@tflite-micro//tensorflow/lite/kernels/internal:tensor",
+        "@tflite-micro//tensorflow/lite/micro/kernels:kernel_runner",
+        "@tflite-micro//tensorflow/lite/micro/testing:micro_test",
+        "@tflite-micro//tensorflow/lite/micro:micro_utils",
+        "@tflite-micro//tensorflow/lite/micro:test_helpers",
+    ],
+)
diff --git a/tflm/opt/BUILD b/tflm/opt/BUILD
new file mode 100644
index 0000000..abf99e6
--- /dev/null
+++ b/tflm/opt/BUILD
@@ -0,0 +1,16 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "opt",
+    srcs = [
+        "memcpy.cc",
+    ],
+    hdrs = [
+        "opt.h",
+    ],
+    deps = [
+        "//crt:crt_header",
+    ],
+    alwayslink = True,  # keep this library's objects in the link even if unreferenced
+    target_compatible_with = ["@kelvin_sw//platforms/cpu:kelvin"],  # only buildable for platforms with this constraint
+)
diff --git a/tflm/opt/memcpy.cc b/tflm/opt/memcpy.cc
new file mode 100644
index 0000000..24df6f3
--- /dev/null
+++ b/tflm/opt/memcpy.cc
@@ -0,0 +1,38 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "crt/kelvin.h"
+
+namespace kelvin::opt {
+
+// Copies `n` bytes from `src` to `dst` with Kelvin vector loads/stores and
+// returns `dst`.
+//
+// NOTE(review): overlapping regions are presumably unsupported, as with the C
+// library memcpy -- confirm.
+void *memcpy(void *dst, const void *src, size_t n) {
+  const uint8_t *s = static_cast<const uint8_t *>(src);
+  uint8_t *d = static_cast<uint8_t *>(dst);
+  int vl;
+  // `s` and `d` are never advanced explicitly here: the loop relies on the
+  // load/store intrinsics to step them (presumably the post-increment forms;
+  // confirm against crt/kelvin.h). The body is unrolled twice, alternating
+  // between v0 and v4.
+  while (n > 0) {
+    getvl_b_x_m(vl, n);
+    n -= vl;
+    vld_b_lp_xx_m(v0, s, vl);
+    vst_b_lp_xx_m(v0, d, vl);
+
+    // `n` is unsigned, so "nothing left to copy" is exactly n == 0.
+    if (n == 0) break;
+    getvl_b_x_m(vl, n);
+    n -= vl;
+    vld_b_lp_xx_m(v4, s, vl);
+    vst_b_lp_xx_m(v4, d, vl);
+  }
+  return dst;
+}
+
+}  // namespace kelvin::opt
diff --git a/tflm/opt/opt.h b/tflm/opt/opt.h
new file mode 100644
index 0000000..5574daf
--- /dev/null
+++ b/tflm/opt/opt.h
@@ -0,0 +1,16 @@
+// Copyright 2023 Google LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef OPT_OPT_H_
+#define OPT_OPT_H_
+
+#include <cstddef>
+
+namespace kelvin::opt {
+// Copies `n` bytes from `src` to `dst` and returns `dst`.
+// Defined in memcpy.cc.
+void *memcpy(void *dst, const void *src, size_t n);
+}  // namespace kelvin::opt
+
+#endif  // OPT_OPT_H_
diff --git a/third_party/ml-models/BUILD b/third_party/ml-models/BUILD
new file mode 100644
index 0000000..3f152f8
--- /dev/null
+++ b/third_party/ml-models/BUILD
@@ -0,0 +1,6 @@
+# Copyright 2023 Google LLC
+package(default_visibility = ["//visibility:public"])
+
+exports_files(
+    glob(["quant_models/*.tflite"])  # every .tflite file directly under quant_models/
+)