Merge remote-tracking branch 'spacebeaker/upstream' into master

Change-Id: Ieb5c6acc9bbafad3813c83058a9e40089fee2e3c
diff --git a/tensorflow/extra_rules.bzl b/tensorflow/extra_rules.bzl
index 4a111dc..29e0bda 100644
--- a/tensorflow/extra_rules.bzl
+++ b/tensorflow/extra_rules.bzl
@@ -1,5 +1,7 @@
 def tflm_kernel_friends():
-    return []
+    return [
+        "public",
+    ]
 
 def tflm_audio_frontend_friends():
     return []
@@ -32,3 +34,7 @@
 def xtensa_vision_p6_config():
     """Config setting for all Vision P6 based cores."""
     return "//tensorflow/lite/micro/kernels:xtensa_vision_p6_default"
+
+def kelvin_config():
+    """Config setting for Kelvin-based cores."""
+    return "//tensorflow/lite/micro/kernels:kelvin_default"
diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD
index 1753465..58ea22d 100644
--- a/tensorflow/lite/micro/BUILD
+++ b/tensorflow/lite/micro/BUILD
@@ -334,8 +334,11 @@
     hdrs = [
         "micro_time.h",
     ],
-    copts = micro_copts() + ["-DTF_LITE_USE_CTIME"],
-    deps = ["//tensorflow/lite/c:common"],
+    copts = micro_copts(),
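+    # Time is measured with the Kelvin cycle counter instead of ctime.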
+    deps = [
+        "//tensorflow/lite/c:common",
+        "@kelvin_sw//benchmarks:cycle_count",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD
index f2ccb06..22254d9 100644
--- a/tensorflow/lite/micro/kernels/BUILD
+++ b/tensorflow/lite/micro/kernels/BUILD
@@ -8,7 +8,9 @@
     "xtensa_hifi_3z_config",
     "xtensa_hifi_5_config",
     "xtensa_vision_p6_config",
+    "kelvin_config",
 )
+load("@bazel_skylib//lib:selects.bzl", "selects")
 
 package(
     features = [
@@ -33,6 +35,10 @@
     packages = tflm_kernel_friends(),
 )
 
+exports_files(
+    glob(["*_test.cc"]),
+)
+
 ####################################
 # C++ libraries
 ####################################
@@ -66,6 +72,9 @@
     hdrs = [
         "conv_test.h",
     ],
+    visibility = [
+        "//visibility:public",
+    ],
     copts = micro_copts(),
     deps = [
         ":kernel_runner",
@@ -191,6 +200,9 @@
     "-DVISION_P6=1",
 ]
 
+KELVIN_COPTS = [
+]
+
 tflm_kernel_cc_library(
     name = "micro_ops",
     srcs = [
@@ -331,6 +343,7 @@
         xtensa_hifi_3z_config(): glob(["xtensa/**/*.h"]),
         xtensa_hifi_5_config(): glob(["xtensa/**/*.h"]),
         xtensa_vision_p6_config(): glob(["xtensa/**/*.h"]),
+        kelvin_config(): glob(["kelvin/**/*.h"]),
         "//conditions:default": [],
     }),
     accelerated_srcs = {
@@ -339,6 +352,7 @@
         xtensa_hifi_3z_config(): glob(["xtensa/**/*.cc"]),
         xtensa_hifi_5_config(): glob(["xtensa/**/*.cc"]),
         xtensa_vision_p6_config(): glob(["xtensa/**/*.cc"]),
+        kelvin_config(): glob(["kelvin/**/*.cc"]),
     },
     copts = micro_copts() + select({
         xtensa_fusion_f1_config(): HIFI4_COPTS,
@@ -346,6 +360,7 @@
         xtensa_hifi_3z_config(): HIFI4_COPTS,
         xtensa_hifi_5_config(): HIFI5_COPTS,
         xtensa_vision_p6_config(): VP6_COPTS,
+        kelvin_config(): KELVIN_COPTS,
         "//conditions:default": [],
     }),
     visibility = [
@@ -382,6 +397,7 @@
         xtensa_hifi_3z_config(): ["//third_party/xtensa/nnlib_hifi4:nnlib_hifi4_lib"],
         xtensa_hifi_5_config(): ["//third_party/xtensa/nnlib_hifi5:nnlib_hifi5_lib"],
         xtensa_vision_p6_config(): ["//third_party/xtensa/xi_tflmlib_vision_p6:xi_tflmlib_vision_p6_lib"],
+        kelvin_config(): ["@kelvin_sw//tflm/opt:opt"],
         "//conditions:default": [],
     }),
 )
@@ -1515,3 +1531,22 @@
         ":optimized_kernels": "xtensa_vision_p6",
     },
 )
+
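+# Match the Kelvin platform whether it is referenced through the external
+# @kelvin_sw repository or from within the kelvin_sw workspace itself.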
+config_setting(
+    name = "kelvin_default1",
+    values = {
+        "platforms": "@kelvin_sw//platforms/riscv32:kelvin",
+    },
+)
+
+config_setting(
+    name = "kelvin_default2",
+    values = {
+        "platforms": "//platforms/riscv32:kelvin",
+    },
+)
+
+selects.config_setting_group(
+    name = "kelvin_default",
+    match_any = [":kelvin_default1", ":kelvin_default2"],
+)
diff --git a/tensorflow/lite/micro/kernels/add_test.cc b/tensorflow/lite/micro/kernels/add_test.cc
index 6e8b40c..bdf0224 100644
--- a/tensorflow/lite/micro/kernels/add_test.cc
+++ b/tensorflow/lite/micro/kernels/add_test.cc
@@ -256,16 +256,42 @@
 TF_LITE_MICRO_TEST(QuantizedAddNoActivationInt8) {
   const float scales[] = {0.25, 0.5, 1.0};
   const int zero_points[] = {-10, 4, 13};
-  int inout_shape[] = {4, 1, 2, 2, 1};
-  const float input1_values[] = {-2.01, -1.01, -0.01, 0.98};
-  const float input2_values[] = {1.01, 1.99, 2.99, 4.02};
-  const float golden_values[] = {-1, 1, 3, 5};
+  int inout_shape[] = {4, 1, 7, 6, 1};
+  // clang-format off
+  const float input1_values[] = {
+      -2.01, -1.01, -0.01, 0.98, -2.01, -1.01,
+      -0.01, 0.98, -2.01, -1.01, -0.01, 0.98,
+      -2.01, -1.01, -0.01, 0.98, -2.01, -1.01,
+      -0.01, 0.98, -2.01, -1.01, -0.01, 0.98,
+      -2.01, -1.01, -0.01, 0.98, -2.01, -1.01,
+      -0.01, 0.98, -2.01, -1.01, -0.01, 0.98,
+      -2.0, 0.2, 0.7, 0.8, 1.1, 2.0
+  };
+  const float input2_values[] = {
+      1.01, 1.99, 2.99, 4.02, 1.01, 1.99,
+      2.99, 4.02, 1.01, 1.99, 2.99, 4.02,
+      1.01, 1.99, 2.99, 4.02, 1.01, 1.99,
+      2.99, 4.02, 1.01, 1.99, 2.99, 4.02,
+      1.01, 1.99, 2.99, 4.02, 1.01, 1.99,
+      2.99, 4.02, 1.01, 1.99, 2.99, 4.02,
+      0.1,  0.2,  0.3,  0.5,  1.1,  0.1
+  };
+  const float golden_values[] = {
+      -1, 1, 3, 5, -1, 1,
+      3, 5, -1, 1, 3, 5,
+      -1, 1, 3, 5, -1, 1,
+      3, 5, -1, 1, 3, 5,
+      -1, 1, 3, 5, -1, 1,
+      3, 5, -1, 1, 3, 5,
+      -1.9, 0.4, 1.0, 1.3, 2.2, 2.1
+  };
+  // clang-format on
 
-  constexpr int kOutputDimsCount = 4;
-  int8_t input1_quantized[kOutputDimsCount];
-  int8_t input2_quantized[kOutputDimsCount];
-  int8_t golden_quantized[kOutputDimsCount];
-  int8_t output[kOutputDimsCount];
+  constexpr int kOutputDimsCount = 42;
+  int8_t input1_quantized[kOutputDimsCount] __attribute__((aligned(64)));
+  int8_t input2_quantized[kOutputDimsCount] __attribute__((aligned(64)));
+  int8_t golden_quantized[kOutputDimsCount] __attribute__((aligned(64)));
+  int8_t output[kOutputDimsCount] __attribute__((aligned(64)));
 
   tflite::testing::TestAddQuantized(
       inout_shape, input1_values, input1_quantized, scales[0], zero_points[0],
diff --git a/tensorflow/lite/micro/kernels/kelvin/add.cc b/tensorflow/lite/micro/kernels/kelvin/add.cc
new file mode 100644
index 0000000..8c33716
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/add.cc
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/kernels/internal/reference/add.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/micro/kernels/add.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+namespace {
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataAdd));
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return AddPrepare(context, node);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpDataAdd* data = static_cast<const OpDataAdd*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kAddInputTensor1);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kAddInputTensor2);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kAddOutputTensor);
+
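+  // Elementwise int32, int16, and int8 adds dispatch to the Kelvin-optimized
+  // kernels; float32 and broadcast cases use the reference implementations.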
+  if (output->type == kTfLiteFloat32) {
+    tflite::ArithmeticParams op_params;
+    SetActivationParams(data->output_activation_min_f32,
+                        data->output_activation_max_f32, &op_params);
+    if (data->requires_broadcast) {
+      reference_ops::BroadcastAdd4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<float>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<float>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output));
+    } else {
+      reference_ops::Add(op_params, tflite::micro::GetTensorShape(input1),
+                         tflite::micro::GetTensorData<float>(input1),
+                         tflite::micro::GetTensorShape(input2),
+                         tflite::micro::GetTensorData<float>(input2),
+                         tflite::micro::GetTensorShape(output),
+                         tflite::micro::GetTensorData<float>(output));
+    }
+  } else if (output->type == kTfLiteInt32) {
+    tflite::ArithmeticParams op_params;
+    SetActivationParams(std::numeric_limits<int32_t>::lowest(),
+                        std::numeric_limits<int32_t>::max(), &op_params);
+    if (data->requires_broadcast) {
+      reference_ops::BroadcastAdd4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int32_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int32_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int32_t>(output));
+    } else {
+      kelvin::opt::ElementwiseAddS32(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int32_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int32_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int32_t>(output));
+    }
+  } else if (output->type == kTfLiteInt16) {
+    tflite::ArithmeticParams op_params;
+    op_params.left_shift = data->left_shift;
+    op_params.input1_offset = data->input1_offset;
+    op_params.input1_multiplier = data->input1_multiplier;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_offset = data->input2_offset;
+    op_params.input2_multiplier = data->input2_multiplier;
+    op_params.input2_shift = data->input2_shift;
+    op_params.output_offset = data->output_offset;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
+
+    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorShape(input2), &op_params);
+
+    if (need_broadcast) {
+      reference_ops::BroadcastAdd4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int16_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int16_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int16_t>(output));
+    } else {
+      kelvin::opt::ElementwiseAddS16(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int16_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int16_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int16_t>(output));
+    }
+  } else if (output->type == kTfLiteInt8) {
+    tflite::ArithmeticParams op_params;
+    op_params.left_shift = data->left_shift;
+    op_params.input1_offset = data->input1_offset;
+    op_params.input1_multiplier = data->input1_multiplier;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_offset = data->input2_offset;
+    op_params.input2_multiplier = data->input2_multiplier;
+    op_params.input2_shift = data->input2_shift;
+    op_params.output_offset = data->output_offset;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
+
+    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorShape(input2), &op_params);
+
+    if (need_broadcast) {
+      reference_integer_ops::BroadcastAdd4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int8_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int8_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
+    } else {
+      kelvin::opt::ElementwiseAddS8(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int8_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int8_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
+    }
+  } else {
+    MicroPrintf("Unsupported output type: %s", TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TFLMRegistration Register_ADD() {
+  return tflite::micro::RegisterOp(Init, Prepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/conv.cc b/tensorflow/lite/micro/kernels/kelvin/conv.cc
new file mode 100644
index 0000000..d8fb8a1
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/conv.cc
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/conv.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kFilterHeightIndex = 1;
+constexpr int kFilterWidthIndex = 2;
+constexpr int kFilterInputChannelIndex = 3;
+constexpr int kInputChannelIndex = 3;
+constexpr int kOutputChannelIndex = 3;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataConv));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kConvInputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kConvWeightsTensor);
+  const TfLiteEvalTensor* bias =
+      (NumInputs(node) == 3)
+          ? tflite::micro::GetEvalInput(context, node, kConvBiasTensor)
+          : nullptr;
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kConvOutputTensor);
+
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  const auto& params =
+      *(reinterpret_cast<TfLiteConvParams*>(node->builtin_data));
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const auto& data = *(static_cast<const OpDataConv*>(node->user_data));
+
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_MSG(
+      context,
+      input->type == filter->type ||
+          (input->type == kTfLiteInt16 && filter->type == kTfLiteInt8) ||
+          (input->type == kTfLiteInt8 && filter->type == kTfLiteInt4),
+      "Hybrid models are not supported on TFLite Micro.");
+
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32: {
+      tflite::reference_ops::Conv(
+          ConvParamsFloat(params, data), tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<float>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetOptionalTensorData<float>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output),
+          tflite::micro::GetTensorShape(nullptr), nullptr);
+      break;
+    }
+    case kTfLiteInt16: {
+      const auto params_q = ConvParamsQuantized(params, data);
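+      // Use the Kelvin-optimized kernel only when there is no padding and no
+      // dilation; otherwise use the reference per-channel implementation.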
+      bool opt = !(params_q.padding_values.width > 0 ||
+                   params_q.padding_values.height > 0 ||
+                   params_q.dilation_width_factor > 1 ||
+                   params_q.dilation_height_factor > 1);
+      switch (bias->type) {
+        case kTfLiteInt32: {
+          const auto fn = opt ? kelvin::opt::ConvS16B32
+                              : reference_integer_ops::ConvPerChannel<int32_t>;
+          fn(params_q, data.per_channel_output_multiplier,
+             data.per_channel_output_shift,
+             tflite::micro::GetTensorShape(input),
+             tflite::micro::GetTensorData<int16_t>(input),
+             tflite::micro::GetTensorShape(filter),
+             tflite::micro::GetTensorData<int8_t>(filter),
+             tflite::micro::GetTensorShape(bias),
+             tflite::micro::GetOptionalTensorData<std::int32_t>(bias),
+             tflite::micro::GetTensorShape(output),
+             tflite::micro::GetTensorData<int16_t>(output));
+          break;
+        }
+        case kTfLiteInt64: {
+          const auto fn = opt ? kelvin::opt::ConvS16B64
+                              : reference_integer_ops::ConvPerChannel<int64_t>;
+          fn(params_q, data.per_channel_output_multiplier,
+             data.per_channel_output_shift,
+             tflite::micro::GetTensorShape(input),
+             tflite::micro::GetTensorData<int16_t>(input),
+             tflite::micro::GetTensorShape(filter),
+             tflite::micro::GetTensorData<int8_t>(filter),
+             tflite::micro::GetTensorShape(bias),
+             tflite::micro::GetOptionalTensorData<std::int64_t>(bias),
+             tflite::micro::GetTensorShape(output),
+             tflite::micro::GetTensorData<int16_t>(output));
+          break;
+        }
+        default:
+          MicroPrintf("Bias type %s (%d) not supported.",
+                      TfLiteTypeGetName(bias->type), bias->type);
+          return kTfLiteError;
+      }
+      break;
+    }
+    case kTfLiteInt8: {
+      switch (filter->type) {
+        case kTfLiteInt4: {
+          int8_t* unpacked_filter_data = reinterpret_cast<int8_t*>(
+              context->GetScratchBuffer(context, data.filter_buffer_index));
+          tflite::tensor_utils::UnpackDenseInt4IntoInt8(
+              tflite::micro::GetTensorData<int8_t>(filter),
+              tflite::micro::GetTensorShape(filter).FlatSize(),
+              unpacked_filter_data);
+          reference_integer_ops::ConvPerChannel(
+              ConvParamsQuantized(params, data),
+              data.per_channel_output_multiplier, data.per_channel_output_shift,
+              tflite::micro::GetTensorShape(input),
+              tflite::micro::GetTensorData<int8_t>(input),
+              tflite::micro::GetTensorShape(filter), unpacked_filter_data,
+              tflite::micro::GetTensorShape(bias),
+              tflite::micro::GetOptionalTensorData<int32_t>(bias),
+              tflite::micro::GetTensorShape(output),
+              tflite::micro::GetTensorData<int8_t>(output));
+          break;
+        }
+        case kTfLiteInt8: {
+          const auto params_q = ConvParamsQuantized(params, data);
+          kelvin::opt::ConvS8(
+              params_q, data.per_channel_output_multiplier,
+              data.per_channel_output_shift,
+              tflite::micro::GetTensorShape(input),
+              tflite::micro::GetTensorData<int8_t>(input),
+              tflite::micro::GetTensorShape(filter),
+              tflite::micro::GetTensorData<int8_t>(filter),
+              tflite::micro::GetTensorShape(bias),
+              tflite::micro::GetOptionalTensorData<int32_t>(bias),
+              tflite::micro::GetTensorShape(output),
+              tflite::micro::GetTensorData<int8_t>(output));
+          break;
+        }
+        default:
+          MicroPrintf("Weight type %s (%d) not supported.",
+                      TfLiteTypeGetName(filter->type), filter->type);
+          return kTfLiteError;
+      }
+      break;
+    }
+    default:
+      MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(input->type),
+                  input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TFLMRegistration Register_CONV_2D() {
+  return tflite::micro::RegisterOp(Init, ConvPrepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/depthwise_conv.cc b/tensorflow/lite/micro/kernels/kelvin/depthwise_conv.cc
new file mode 100644
index 0000000..f8d9307
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/depthwise_conv.cc
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/depthwise_conv.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+namespace {
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataConv));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  auto& params =
+      *(reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data));
+  const OpDataConv& data = *(static_cast<const OpDataConv*>(node->user_data));
+
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kDepthwiseConvOutputTensor);
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kDepthwiseConvInputTensor);
+  const TfLiteEvalTensor* filter =
+      tflite::micro::GetEvalInput(context, node, kDepthwiseConvWeightsTensor);
+  const TfLiteEvalTensor* bias =
+      (NumInputs(node) == 3)
+          ? tflite::micro::GetEvalInput(context, node, kDepthwiseConvBiasTensor)
+          : nullptr;
+
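+  // Int8 and int16 inputs with int8 filters dispatch to the Kelvin-optimized
+  // kernels; float32 and int4 filters use the reference implementations.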
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32: {
+      tflite::reference_ops::DepthwiseConv(
+          DepthwiseConvParamsFloat(params, data),
+          tflite::micro::GetTensorShape(input),
+          tflite::micro::GetTensorData<float>(input),
+          tflite::micro::GetTensorShape(filter),
+          tflite::micro::GetTensorData<float>(filter),
+          tflite::micro::GetTensorShape(bias),
+          tflite::micro::GetOptionalTensorData<float>(bias),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<float>(output));
+      break;
+    }
+    case kTfLiteInt8: {
+      switch (filter->type) {
+        case kTfLiteInt4: {
+          int8_t* unpacked_filter_data = static_cast<int8_t*>(
+              context->GetScratchBuffer(context, data.filter_buffer_index));
+          tflite::tensor_utils::UnpackDenseInt4IntoInt8(
+              tflite::micro::GetTensorData<int8_t>(filter),
+              tflite::micro::GetTensorShape(filter).FlatSize(),
+              unpacked_filter_data);
+          reference_integer_ops::DepthwiseConvPerChannel(
+              DepthwiseConvParamsQuantized(params, data),
+              data.per_channel_output_multiplier, data.per_channel_output_shift,
+              tflite::micro::GetTensorShape(input),
+              tflite::micro::GetTensorData<int8_t>(input),
+              tflite::micro::GetTensorShape(filter), unpacked_filter_data,
+              tflite::micro::GetTensorShape(bias),
+              tflite::micro::GetOptionalTensorData<int32_t>(bias),
+              tflite::micro::GetTensorShape(output),
+              tflite::micro::GetTensorData<int8_t>(output));
+          break;
+        }
+        case kTfLiteInt8: {
+          tflite::DepthwiseParams dw_params =
+              DepthwiseConvParamsQuantized(params, data);
+          kelvin::opt::DepthwiseConvS8(
+              dw_params, data.per_channel_output_multiplier,
+              data.per_channel_output_shift,
+              tflite::micro::GetTensorShape(input),
+              tflite::micro::GetTensorData<int8_t>(input),
+              tflite::micro::GetTensorShape(filter),
+              tflite::micro::GetTensorData<int8_t>(filter),
+              tflite::micro::GetTensorShape(bias),
+              tflite::micro::GetOptionalTensorData<int32_t>(bias),
+              tflite::micro::GetTensorShape(output),
+              tflite::micro::GetTensorData<int8_t>(output));
+          break;
+        }
+        default:
+          MicroPrintf("Filter type %s (%d) for input type %s not supported.",
+                      TfLiteTypeGetName(filter->type), filter->type,
+                      TfLiteTypeGetName(input->type));
+          return kTfLiteError;
+      }
+      break;
+    }
+    case kTfLiteInt16: {
+      switch (filter->type) {
+        case kTfLiteInt8: {
+          tflite::DepthwiseParams dw_params =
+              DepthwiseConvParamsQuantized(params, data);
+          kelvin::opt::DepthwiseConvS16(
+              dw_params, data.per_channel_output_multiplier,
+              data.per_channel_output_shift,
+              tflite::micro::GetTensorShape(input),
+              tflite::micro::GetTensorData<int16_t>(input),
+              tflite::micro::GetTensorShape(filter),
+              tflite::micro::GetTensorData<int8_t>(filter),
+              tflite::micro::GetTensorShape(bias),
+              tflite::micro::GetOptionalTensorData<int64_t>(bias),
+              tflite::micro::GetTensorShape(output),
+              tflite::micro::GetTensorData<int16_t>(output));
+          break;
+        }
+        default:
+          MicroPrintf("Filter type %s (%d) for input type %s not supported.",
+                      TfLiteTypeGetName(filter->type), filter->type,
+                      TfLiteTypeGetName(input->type));
+          return kTfLiteError;
+      }
+      break;
+    }
+    default:
+      MicroPrintf("Input type %s (%d) not supported.",
+                  TfLiteTypeGetName(input->type), input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TFLMRegistration Register_DEPTHWISE_CONV_2D() {
+  return tflite::micro::RegisterOp(Init, DepthwiseConvPrepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/leaky_relu.cc b/tensorflow/lite/micro/kernels/kelvin/leaky_relu.cc
new file mode 100644
index 0000000..fafcfed
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/leaky_relu.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/leaky_relu.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+
+namespace {
+void* LeakyReluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(LeakyReluOpData));
+}
+
+TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+  const LeakyReluOpData& data = *static_cast<LeakyReluOpData*>(node->user_data);
+
+  // Kelvin's vector ISA is used to implement Int8 and Int16.
+  // Float32 uses the reference op.
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      LeakyReluParams op_params = {};
+      const auto* params =
+          static_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+
+      op_params.alpha = params->alpha;
+      reference_ops::LeakyRelu(op_params, tflite::micro::GetTensorShape(input),
+                               tflite::micro::GetTensorData<float>(input),
+                               tflite::micro::GetTensorShape(output),
+                               tflite::micro::GetTensorData<float>(output));
+      return kTfLiteOk;
+    } break;
+    case kTfLiteInt8: {
+      LeakyReluParams op_params = {};
+      op_params.input_offset = data.input_zero_point;
+      op_params.output_offset = data.output_zero_point;
+      op_params.output_multiplier_alpha = data.output_multiplier_alpha;
+      op_params.output_shift_alpha = data.output_shift_alpha;
+      op_params.output_multiplier_identity = data.output_multiplier_identity;
+      op_params.output_shift_identity = data.output_shift_identity;
+      kelvin::opt::LeakyReluS8(op_params, tflite::micro::GetTensorShape(input),
+                               tflite::micro::GetTensorData<int8_t>(input),
+                               tflite::micro::GetTensorShape(output),
+                               tflite::micro::GetTensorData<int8_t>(output));
+      return kTfLiteOk;
+    } break;
+    case kTfLiteInt16: {
+      LeakyReluParams op_params = {};
+      op_params.input_offset = data.input_zero_point;
+      op_params.output_offset = data.output_zero_point;
+      op_params.output_multiplier_alpha = data.output_multiplier_alpha;
+      op_params.output_shift_alpha = data.output_shift_alpha;
+      op_params.output_multiplier_identity = data.output_multiplier_identity;
+      op_params.output_shift_identity = data.output_shift_identity;
+      kelvin::opt::LeakyReluS16(op_params, tflite::micro::GetTensorShape(input),
+                                tflite::micro::GetTensorData<int16_t>(input),
+                                tflite::micro::GetTensorShape(output),
+                                tflite::micro::GetTensorData<int16_t>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      MicroPrintf(
+          "Only float32, int8, int16 are supported by LEAKY_RELU, got %s.",
+          TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+
+  return kTfLiteError;
+}
+
+}  // namespace
+
+TFLMRegistration Register_LEAKY_RELU() {
+  return tflite::micro::RegisterOp(LeakyReluInit, LeakyReluPrepare,
+                                   LeakyReluEval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/logistic.cc b/tensorflow/lite/micro/kernels/kelvin/logistic.cc
new file mode 100644
index 0000000..974ef12
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/logistic.cc
@@ -0,0 +1,112 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/logistic.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/logistic.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+namespace {
+
+void* LogisticInit(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataLogistic));
+}
+
+TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kLogisticInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kLogisticOutputTensor);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  OpDataLogistic* data = static_cast<OpDataLogistic*>(node->user_data);
+
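+  // Only the int8 path is Kelvin-optimized; float32 and int16 use the
+  // reference implementations.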
+  if (input->type == kTfLiteFloat32) {
+    switch (output->type) {
+      case kTfLiteFloat32: {
+        reference_ops::Logistic(tflite::micro::GetTensorShape(input),
+                                tflite::micro::GetTensorData<float>(input),
+                                tflite::micro::GetTensorShape(output),
+                                tflite::micro::GetTensorData<float>(output));
+        return kTfLiteOk;
+      }
+      default:
+        MicroPrintf("Input %s, output %s not supported.",
+                    TfLiteTypeGetName(input->type),
+                    TfLiteTypeGetName(output->type));
+        return kTfLiteError;
+    }
+  } else if (input->type == kTfLiteInt16) {
+    switch (output->type) {
+      case kTfLiteInt16: {
+        reference_integer_ops::Logistic(
+            data->input_multiplier, data->input_left_shift,
+            NumElements(input->dims),
+            tflite::micro::GetTensorData<int16_t>(input),
+            tflite::micro::GetTensorData<int16_t>(output));
+        return kTfLiteOk;
+      }
+      default:
+        MicroPrintf("Input %s, output %s not supported.",
+                    TfLiteTypeGetName(input->type),
+                    TfLiteTypeGetName(output->type));
+        return kTfLiteError;
+    }
+  } else if (input->type == kTfLiteInt8) {
+    switch (output->type) {
+      case kTfLiteInt8: {
+        kelvin::opt::LogisticS8(
+            data->input_zero_point, data->input_range_radius,
+            data->input_multiplier, data->input_left_shift,
+            NumElements(input->dims),
+            tflite::micro::GetTensorData<int8_t>(input),
+            tflite::micro::GetTensorData<int8_t>(output));
+        return kTfLiteOk;
+      }
+      default:
+        MicroPrintf("Input %s, output %s not supported.",
+                    TfLiteTypeGetName(input->type),
+                    TfLiteTypeGetName(output->type));
+        return kTfLiteError;
+    }
+  } else {
+    // TODO(b/141211002): Also support other data types once we have supported
+    // temporary tensors in TFLM.
+    MicroPrintf("Input %s, output %s not supported.",
+                TfLiteTypeGetName(input->type),
+                TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TFLMRegistration Register_LOGISTIC() {
+  return tflite::micro::RegisterOp(LogisticInit, LogisticPrepare, LogisticEval);
+}
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/mul.cc b/tensorflow/lite/micro/kernels/kelvin/mul.cc
new file mode 100644
index 0000000..e006e9b
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/mul.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/mul.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+
+TfLiteStatus MulEval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpDataMul* data = static_cast<const OpDataMul*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kMulInput1Tensor);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kMulInput2Tensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kMulOutputTensor);
+
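+  // Elementwise int16 and int8 multiplies dispatch to the Kelvin-optimized
+  // kernels; float32, int32, and broadcast cases use the reference paths.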
+  if (output->type == kTfLiteFloat32) {
+    EvalMulFloatReference(context, node, params, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt32) {
+    EvalMulQuantizedReference(context, node, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt16) {
+    tflite::ArithmeticParams op_params = {};
+    op_params.quantized_activation_min = data->output_activation_min;
+    op_params.quantized_activation_max = data->output_activation_max;
+    op_params.input1_offset = -data->input1_zero_point;
+    op_params.input2_offset = -data->input2_zero_point;
+    op_params.output_offset = data->output_zero_point;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+
+    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorShape(input2), &op_params);
+
+    if (need_broadcast) {
+      reference_integer_ops::BroadcastMul4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int16_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int16_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int16_t>(output));
+    } else {
+      kelvin::opt::MulS16(op_params, tflite::micro::GetTensorShape(input1),
+                          tflite::micro::GetTensorData<int16_t>(input1),
+                          tflite::micro::GetTensorShape(input2),
+                          tflite::micro::GetTensorData<int16_t>(input2),
+                          tflite::micro::GetTensorShape(output),
+                          tflite::micro::GetTensorData<int16_t>(output));
+    }
+  } else if (output->type == kTfLiteInt8) {
+    tflite::ArithmeticParams op_params = {};
+    op_params.quantized_activation_min = data->output_activation_min;
+    op_params.quantized_activation_max = data->output_activation_max;
+    op_params.input1_offset = -data->input1_zero_point;
+    op_params.input2_offset = -data->input2_zero_point;
+    op_params.output_offset = data->output_zero_point;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+
+    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorShape(input2), &op_params);
+    if (need_broadcast) {
+      reference_integer_ops::BroadcastMul4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int8_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int8_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
+    } else {
+      kelvin::opt::MulS8(op_params, tflite::micro::GetTensorShape(input1),
+                         tflite::micro::GetTensorData<int8_t>(input1),
+                         tflite::micro::GetTensorShape(input2),
+                         tflite::micro::GetTensorData<int8_t>(input2),
+                         tflite::micro::GetTensorShape(output),
+                         tflite::micro::GetTensorData<int8_t>(output));
+    }
+  } else {
+    MicroPrintf("Unsupported output type: %s", TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TFLMRegistration Register_MUL() {
+  return tflite::micro::RegisterOp(MulInit, MulPrepare, MulEval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/pooling.cc b/tensorflow/lite/micro/kernels/kelvin/pooling.cc
new file mode 100644
index 0000000..94fc6f2
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/pooling.cc
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/pooling.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+
+namespace {
+
+TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpDataPooling* data =
+      static_cast<const OpDataPooling*>(node->user_data);
+
+  const TfLiteEvalTensor* input =
+      micro::GetEvalInput(context, node, kPoolingInputTensor);
+  TfLiteEvalTensor* output =
+      micro::GetEvalOutput(context, node, kPoolingOutputTensor);
+
+  // Inputs and outputs share the same type, guaranteed by the converter.
+  switch (input->type) {
+    case kTfLiteFloat32:
+      AveragePoolingEvalFloat(context, node, params, data, input, output);
+      break;
+    case kTfLiteInt8:
+      AveragePoolingEvalQuantized<int8_t>(context, node, params, data, input,
+                                          output);
+      break;
+    case kTfLiteInt16:
+      AveragePoolingEvalQuantized<int16_t>(context, node, params, data, input,
+                                           output);
+      break;
+    default:
+      MicroPrintf("Input type %s is not currently supported",
+                  TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpDataPooling* data =
+      static_cast<const OpDataPooling*>(node->user_data);
+
+  const TfLiteEvalTensor* input =
+      micro::GetEvalInput(context, node, kPoolingInputTensor);
+  TfLiteEvalTensor* output =
+      micro::GetEvalOutput(context, node, kPoolingOutputTensor);
+
+  tflite::PoolParams op_params;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.filter_height = params->filter_height;
+  op_params.filter_width = params->filter_width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+  op_params.quantized_activation_min = data->activation_min;
+  op_params.quantized_activation_max = data->activation_max;
+  op_params.float_activation_min = data->activation_min_f32;
+  op_params.float_activation_max = data->activation_max_f32;
+
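+  // Int8 and int16 max pooling dispatch to the Kelvin-optimized kernels;
+  // float32 uses the reference implementation.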
+  switch (input->type) {
+    case kTfLiteFloat32:
+      reference_ops::MaxPool(op_params, tflite::micro::GetTensorShape(input),
+                             tflite::micro::GetTensorData<float>(input),
+                             tflite::micro::GetTensorShape(output),
+                             tflite::micro::GetTensorData<float>(output));
+      break;
+    case kTfLiteInt8:
+      kelvin::opt::MaxPoolS8(
+          op_params, tflite::micro::GetTensorShape(input), input->data.int8,
+          tflite::micro::GetTensorShape(output), output->data.int8);
+      break;
+    case kTfLiteInt16:
+      kelvin::opt::MaxPoolS16(
+          op_params, tflite::micro::GetTensorShape(input), input->data.i16,
+          tflite::micro::GetTensorShape(output), output->data.i16);
+      break;
+    default:
+      MicroPrintf("Type %s not currently supported.",
+                  TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  return context->AllocatePersistentBuffer(context, sizeof(OpDataPooling));
+}
+
+}  // namespace
+
+TFLMRegistration Register_AVERAGE_POOL_2D() {
+  return tflite::micro::RegisterOp(Init, PoolingPrepare, AverageEval);
+}
+
+TFLMRegistration Register_MAX_POOL_2D() {
+  return tflite::micro::RegisterOp(Init, PoolingPrepare, MaxEval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/reshape.cc b/tensorflow/lite/micro/kernels/kelvin/reshape.cc
new file mode 100644
index 0000000..76e3e52
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/reshape.cc
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/micro_utils.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
+  TF_LITE_ENSURE(context, input != nullptr);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
+  TF_LITE_ENSURE(context, output != nullptr);
+  // Tensorflow's Reshape allows one of the shape components to have the
+  // special -1 value, meaning it will be calculated automatically based on the
+  // input. Here we calculate what that dimension should be so that the number
+  // of output elements is the same as the number of input elements.
+  int num_input_elements = NumElements(input);
+  TfLiteIntArray* output_shape = output->dims;
+
+  if (NumInputs(node) == 1 &&  // Legacy scalar supported with params.
+      output_shape->size == 1 && output_shape->data[0] == 0) {
+    // Legacy tflite models use a shape parameter of [0] to indicate scalars,
+    // so adjust accordingly. TODO(b/111614235): Allow zero-sized buffers during
+    // toco conversion.
+    output_shape->size = 0;
+  }
+
+  int num_output_elements = 1;
+  int stretch_dim = -1;
+  for (int i = 0; i < output_shape->size; ++i) {
+    int value = output_shape->data[i];
+    if (value == -1) {
+      TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
+      stretch_dim = i;
+    } else {
+      num_output_elements *= value;
+    }
+  }
+  if (stretch_dim != -1) {
+    TfLiteEvalTensor* output_eval =
+        tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+    TF_LITE_ENSURE_STATUS(tflite::micro::CreateWritableTensorDimsWithCopy(
+        context, output, output_eval));
+    output_shape = output->dims;  // output tensor dims were moved
+    output_shape->data[stretch_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_shape->data[stretch_dim];
+  }
+
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(output);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, ReshapeOutput(context, node), kTfLiteOk);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  // TODO(b/162522304): storing input bytes in OpData increases some models
+  // significantly, possibly due to alignment issues.
+  size_t input_bytes;
+  TF_LITE_ENSURE_STATUS(TfLiteTypeSizeOf(input->type, &input_bytes));
+  input_bytes *= ElementCount(*input->dims);
+
+  // Do nothing for in-place reshape.
+  if (input->data.raw != output->data.raw) {
+    // Otherwise perform reshape with copy.
+    kelvin::opt::Memcpy(output->data.raw, input->data.raw, input_bytes);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TFLMRegistration Register_RESHAPE() {
+  return tflite::micro::RegisterOp(nullptr, Prepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/kelvin/resize_nearest_neighbor.cc b/tensorflow/lite/micro/kernels/kelvin/resize_nearest_neighbor.cc
new file mode 100644
index 0000000..5b700ae
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/resize_nearest_neighbor.cc
@@ -0,0 +1,124 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kSizeTensor = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  MicroContext* micro_context = GetMicroContext(context);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input =
+      micro_context->AllocateTempInputTensor(node, kInputTensor);
+  TfLiteTensor* size =
+      micro_context->AllocateTempInputTensor(node, kSizeTensor);
+  TfLiteTensor* output =
+      micro_context->AllocateTempOutputTensor(node, kOutputTensor);
+
+  // Our current implementations rely on the input being 4D,
+  // and the size being a 1D tensor with exactly 2 elements.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
+  TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, size->dims->data[0], 2);
+
+  output->type = input->type;
+
+  if (!IsConstantTensor(size)) {
+    MicroPrintf("Dynamic tensors are unsupported in tfmicro.");
+    return kTfLiteError;
+  }
+
+  micro_context->DeallocateTempTfLiteTensor(input);
+  micro_context->DeallocateTempTfLiteTensor(size);
+  micro_context->DeallocateTempTfLiteTensor(output);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteResizeNearestNeighborParams*>(node->builtin_data);
+
+  const TfLiteEvalTensor* input =
+      tflite::micro::GetEvalInput(context, node, kInputTensor);
+  const TfLiteEvalTensor* size =
+      tflite::micro::GetEvalInput(context, node, kSizeTensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kOutputTensor);
+
+  tflite::ResizeNearestNeighborParams op_params;
+  op_params.align_corners = params->align_corners;
+  op_params.half_pixel_centers = false;
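+  // Note: params->half_pixel_centers is ignored and treated as false.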
+
+  if (output->type == kTfLiteFloat32) {
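+    // The nearest-neighbor kernel only moves data (no arithmetic on values),
+    // so float tensors can safely be accessed through int32_t pointers here.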
+    reference_ops::ResizeNearestNeighbor(
+        op_params, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int32_t>(input),
+        tflite::micro::GetTensorShape(size),
+        tflite::micro::GetTensorData<int32_t>(size),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int32_t>(output));
+  } else if (output->type == kTfLiteInt8) {
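+    // Use the Kelvin-optimized int8 kernel from kelvin_sw's tflm/opt library.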
+    kelvin::opt::ResizeNearestNeighborS8(
+        op_params, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int8_t>(input),
+        tflite::micro::GetTensorShape(size),
+        tflite::micro::GetTensorData<int32_t>(size),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int8_t>(output));
+  } else if (output->type == kTfLiteInt16) {
+    reference_ops::ResizeNearestNeighbor(
+        op_params, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int16_t>(input),
+        tflite::micro::GetTensorShape(size),
+        tflite::micro::GetTensorData<int32_t>(size),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int16_t>(output));
+  } else {
+    MicroPrintf("Output tensor type %s (%d) not supported.",
+                TfLiteTypeGetName(output->type), output->type);
+
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TFLMRegistration Register_RESIZE_NEAREST_NEIGHBOR() {
+  return tflite::micro::RegisterOp(nullptr, Prepare, Eval);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/pack.cc b/tensorflow/lite/micro/kernels/pack.cc
index f254329..0cfd91b 100644
--- a/tensorflow/lite/micro/kernels/pack.cc
+++ b/tensorflow/lite/micro/kernels/pack.cc
@@ -85,6 +85,9 @@
       return PackImpl<int8_t>(context, node, output, data->values_count,
                               data->axis);
     }
+    case kTfLiteInt16: {
+      return PackImpl<int16_t>(context, node, output, data->values_count,
+                               data->axis);
+    }
     case kTfLiteInt32: {
       return PackImpl<int32_t>(context, node, output, data->values_count,
                                data->axis);
diff --git a/tensorflow/lite/micro/kernels/testdata/BUILD b/tensorflow/lite/micro/kernels/testdata/BUILD
index 0c7822d..c93bc7d 100644
--- a/tensorflow/lite/micro/kernels/testdata/BUILD
+++ b/tensorflow/lite/micro/kernels/testdata/BUILD
@@ -16,6 +16,7 @@
     name = "conv_test_data",
     srcs = ["conv_test_data.cc"],
     hdrs = ["conv_test_data.h"],
+    visibility = ["//visibility:public"],
     deps = ["//tensorflow/lite/c:common"],
 )
 
diff --git a/tensorflow/lite/micro/kernels/transpose.cc b/tensorflow/lite/micro/kernels/transpose.cc
index fd17e89..915def5 100644
--- a/tensorflow/lite/micro/kernels/transpose.cc
+++ b/tensorflow/lite/micro/kernels/transpose.cc
@@ -97,6 +97,12 @@
                                tflite::micro::GetTensorShape(output),
                                tflite::micro::GetTensorData<float>(output));
       break;
+    case kTfLiteInt16:
+      reference_ops::Transpose(params, tflite::micro::GetTensorShape(input),
+                               tflite::micro::GetTensorData<int16_t>(input),
+                               tflite::micro::GetTensorShape(output),
+                               tflite::micro::GetTensorData<int16_t>(output));
+      break;
     case kTfLiteInt8:
       reference_ops::Transpose(params, tflite::micro::GetTensorShape(input),
                                tflite::micro::GetTensorData<int8_t>(input),
diff --git a/tensorflow/lite/micro/micro_time.cc b/tensorflow/lite/micro/micro_time.cc
index 2d74fdb..d543820 100644
--- a/tensorflow/lite/micro/micro_time.cc
+++ b/tensorflow/lite/micro/micro_time.cc
@@ -26,6 +26,8 @@
 
 #include "tensorflow/lite/micro/micro_time.h"
 
+#include "benchmarks/cycle_count.h"
+
 #if defined(TF_LITE_USE_CTIME)
 #include <ctime>
 #endif
@@ -34,17 +36,10 @@
 
 #if !defined(TF_LITE_USE_CTIME)
 
-// Reference implementation of the ticks_per_second() function that's required
-// for a platform to support Tensorflow Lite for Microcontrollers profiling.
-// This returns 0 by default because timing is an optional feature that builds
-// without errors on platforms that do not need it.
+// Returns 0 for now: ticks_per_second() is not supported on simulator targets.
 uint32_t ticks_per_second() { return 0; }
 
-// Reference implementation of the GetCurrentTimeTicks() function that's
-// required for a platform to support Tensorflow Lite for Microcontrollers
-// profiling. This returns 0 by default because timing is an optional feature
-// that builds without errors on platforms that do not need it.
-uint32_t GetCurrentTimeTicks() { return 0; }
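+// Report ticks as raw cycle counts, read via mcycle_read() from cycle_count.h.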
+uint32_t GetCurrentTimeTicks() { return static_cast<uint32_t>(mcycle_read()); }
 
 #else  // defined(TF_LITE_USE_CTIME)
 
diff --git a/tensorflow/lite/micro/tools/BUILD b/tensorflow/lite/micro/tools/BUILD
index a85a7ba..e3c6f0c 100644
--- a/tensorflow/lite/micro/tools/BUILD
+++ b/tensorflow/lite/micro/tools/BUILD
@@ -5,7 +5,7 @@
 load("//tensorflow:extra_rules.bzl", "tflm_application_friends")
 
 package(
-    default_visibility = ["//:__subpackages__"],
+    default_visibility = ["//visibility:public"],
     licenses = ["notice"],
 )
 
diff --git a/tensorflow/lite/micro/tools/generate_cc_arrays.py b/tensorflow/lite/micro/tools/generate_cc_arrays.py
index 16d72c1..2a77b4d 100644
--- a/tensorflow/lite/micro/tools/generate_cc_arrays.py
+++ b/tensorflow/lite/micro/tools/generate_cc_arrays.py
@@ -92,9 +92,12 @@
     data_1d = data.flatten()
     out_string = ','.join([str(x) for x in data_1d])
     return [len(data_1d), out_string]
-
   else:
-    raise ValueError('input file must be .tflite, .bmp, .wav or .csv')
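+    # Any other file type is embedded as raw bytes (rendered as a hex string).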
+    with open(input_fname, 'rb') as input_file:
+      buffer = input_file.read()
+    size = len(buffer)
+    out_string = bytes_to_hexstring(buffer)
+    return [size, out_string]
 
 
 def get_array_name(input_fname):
@@ -117,6 +120,8 @@
     return [base_array_name + '_test_data', 'float']
   elif input_fname.endswith('npy'):
     return [base_array_name + '_test_data', 'float']
+  else:
+    return [base_array_name, 'unsigned char']
 
 
 def main():
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b799523..597dc14 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -35,6 +35,8 @@
             "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip",
             "https://github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip",
         ],
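+        # pthread.patch drops gemmlowp's default -lpthread linkopt.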
+        patch_file = "@tflite-micro//third_party/gemmlowp:pthread.patch",
     )
 
     tf_http_archive(
diff --git a/third_party/gemmlowp/BUILD b/third_party/gemmlowp/BUILD
new file mode 100644
index 0000000..c9776cc
--- /dev/null
+++ b/third_party/gemmlowp/BUILD
@@ -0,0 +1,6 @@
+package(
+    default_visibility = ["//visibility:public"],
+    licenses = ["notice"],
+)
+
+exports_files(glob(["*.patch"]))
diff --git a/third_party/gemmlowp/pthread.patch b/third_party/gemmlowp/pthread.patch
new file mode 100644
index 0000000..547dd52
--- /dev/null
+++ b/third_party/gemmlowp/pthread.patch
@@ -0,0 +1,13 @@
+diff --git a/flags.bzl b/flags.bzl
+index e35fe9e..e26a448 100644
+--- a/flags.bzl
++++ b/flags.bzl
+@@ -4,7 +4,7 @@ LIB_COPTS = []
+ LIB_LINKOPTS = select({
+     ":android": [],
+     ":windows": [],
+-    "//conditions:default": ["-lpthread"],
++    "//conditions:default": [],
+ })
+ 
+ BIN_LINKOPTS = LIB_LINKOPTS
\ No newline at end of file
diff --git a/third_party/ruy/BUILD b/third_party/ruy/BUILD
index 518fea8..8fabe49 100644
--- a/third_party/ruy/BUILD
+++ b/third_party/ruy/BUILD
@@ -4,3 +4,5 @@
     default_visibility = ["//visibility:public"],
     licenses = ["notice"],
 )
+
+exports_files(glob(["*.patch"]))
diff --git a/third_party/ruy/pthread.patch b/third_party/ruy/pthread.patch
new file mode 100644
index 0000000..c8ddf4d
--- /dev/null
+++ b/third_party/ruy/pthread.patch
@@ -0,0 +1,11 @@
+diff --git a/ruy/build_defs.oss.bzl b/ruy/build_defs.oss.bzl
+index e405b41..1d7612b 100644
+--- a/ruy/build_defs.oss.bzl
++++ b/ruy/build_defs.oss.bzl
+@@ -11,5 +11,5 @@ def ruy_linkopts_thread_standard_library():
+     # https://github.com/abseil/abseil-cpp/blob/1112609635037a32435de7aa70a9188dcb591458/absl/base/BUILD.bazel#L155
+     return select({
+         "@bazel_tools//src/conditions:windows": [],
+-        "//conditions:default": ["-pthread"],
++        "//conditions:default": [],
+     })
\ No newline at end of file
diff --git a/third_party/ruy/workspace.bzl b/third_party/ruy/workspace.bzl
index 5076962..1671ab5 100644
--- a/third_party/ruy/workspace.bzl
+++ b/third_party/ruy/workspace.bzl
@@ -12,4 +12,5 @@
             "https://github.com/google/ruy/archive/54774a7a2cf85963777289193629d4bd42de4a59.zip",
         ],
         build_file = "//third_party/ruy:BUILD",
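+        # pthread.patch drops ruy's default -pthread linkopt.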
+        patch_file = "@tflite-micro//third_party/ruy:pthread.patch",
     )