tflite-micro: add elementwise multiplication kernel for Kelvin

Dispatch int8 and int16 MUL to the Kelvin-optimized kelvin::opt::MulS8
and kelvin::opt::MulS16 routines when the inputs need no broadcasting;
float32, int32, and broadcast cases fall back to the TFLM reference
implementations.

Change-Id: I14946324e5a460f0c345b8c5528f577310ee532f
diff --git a/tensorflow/lite/micro/kernels/kelvin/mul.cc b/tensorflow/lite/micro/kernels/kelvin/mul.cc
new file mode 100644
index 0000000..e006e9b
--- /dev/null
+++ b/tensorflow/lite/micro/kernels/kelvin/mul.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/mul.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tflm/opt/opt.h"
+
+namespace tflite {
+
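+// Elementwise multiplication kernel for the Kelvin core.
+//
+// Float32 and int32 outputs reuse the shared TFLM reference paths; int16 and
+// int8 outputs dispatch to the Kelvin-optimized MulS16/MulS8 routines
+// whenever the inputs do not need broadcasting.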
+TfLiteStatus MulEval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpDataMul* data = static_cast<const OpDataMul*>(node->user_data);
+
+  const TfLiteEvalTensor* input1 =
+      tflite::micro::GetEvalInput(context, node, kMulInput1Tensor);
+  const TfLiteEvalTensor* input2 =
+      tflite::micro::GetEvalInput(context, node, kMulInput2Tensor);
+  TfLiteEvalTensor* output =
+      tflite::micro::GetEvalOutput(context, node, kMulOutputTensor);
+
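+  // No Kelvin-optimized path exists for float32 or int32, so defer to the
+  // TFLM reference implementations.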
+  if (output->type == kTfLiteFloat32) {
+    EvalMulFloatReference(context, node, params, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt32) {
+    EvalMulQuantizedReference(context, node, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt16) {
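+    // Pack the quantization parameters computed during Prepare. Zero points
+    // are negated so the kernels can add them as offsets to the raw values.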
+    tflite::ArithmeticParams op_params = {};
+    op_params.quantized_activation_min = data->output_activation_min;
+    op_params.quantized_activation_max = data->output_activation_max;
+    op_params.input1_offset = -data->input1_zero_point;
+    op_params.input2_offset = -data->input2_zero_point;
+    op_params.output_offset = data->output_zero_point;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+
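+    // ProcessBroadcastShapes reports whether the two inputs require
+    // broadcasting and fills in the broadcast fields of op_params if so.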
+    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorShape(input2), &op_params);
+
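+    // Broadcasting falls back to the slow 4D reference loop; the common
+    // same-shape case takes the optimized Kelvin path.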
+    if (need_broadcast) {
+      reference_integer_ops::BroadcastMul4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int16_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int16_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int16_t>(output));
+    } else {
+      kelvin::opt::MulS16(op_params, tflite::micro::GetTensorShape(input1),
+                          tflite::micro::GetTensorData<int16_t>(input1),
+                          tflite::micro::GetTensorShape(input2),
+                          tflite::micro::GetTensorData<int16_t>(input2),
+                          tflite::micro::GetTensorShape(output),
+                          tflite::micro::GetTensorData<int16_t>(output));
+    }
+  } else if (output->type == kTfLiteInt8) {
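+    // The int8 path mirrors the int16 one above: reference broadcast loop
+    // when shapes differ, otherwise the Kelvin-optimized MulS8.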
+    tflite::ArithmeticParams op_params = {};
+    op_params.quantized_activation_min = data->output_activation_min;
+    op_params.quantized_activation_max = data->output_activation_max;
+    op_params.input1_offset = -data->input1_zero_point;
+    op_params.input2_offset = -data->input2_zero_point;
+    op_params.output_offset = data->output_zero_point;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+
+    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
+        tflite::micro::GetTensorShape(input1),
+        tflite::micro::GetTensorShape(input2), &op_params);
+    if (need_broadcast) {
+      reference_integer_ops::BroadcastMul4DSlow(
+          op_params, tflite::micro::GetTensorShape(input1),
+          tflite::micro::GetTensorData<int8_t>(input1),
+          tflite::micro::GetTensorShape(input2),
+          tflite::micro::GetTensorData<int8_t>(input2),
+          tflite::micro::GetTensorShape(output),
+          tflite::micro::GetTensorData<int8_t>(output));
+    } else {
+      kelvin::opt::MulS8(op_params, tflite::micro::GetTensorShape(input1),
+                         tflite::micro::GetTensorData<int8_t>(input1),
+                         tflite::micro::GetTensorShape(input2),
+                         tflite::micro::GetTensorData<int8_t>(input2),
+                         tflite::micro::GetTensorShape(output),
+                         tflite::micro::GetTensorData<int8_t>(output));
+    }
+  } else {
+    MicroPrintf("Unsupported output type: %s", TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
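+// MulInit and MulPrepare are shared with the TFLM reference kernel; only
+// Eval is Kelvin-specific.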
+TFLMRegistration Register_MUL() {
+  return tflite::micro::RegisterOp(MulInit, MulPrepare, MulEval);
+}
+
+}  // namespace tflite