CMSIS-NN LSTM issue fixes (#2504)

- Remove read of uninitialized buffer
- Add back integer unidirectional_sequence_lstm_test for CMSIS-NN
- Add back dtln example using CMSIS-NN
- Revert "Add intermediate tensors to LSTM unit test to enable CMSIS-NN (#1996)"

BUG=#2449
diff --git a/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cc b/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cc
index 27e31f5..75ba5ea 100644
--- a/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/micro/kernels/cmsis_nn/unidirectional_sequence_lstm.cc
@@ -37,20 +37,41 @@
   cmsis_nn_lstm_params params_cmsis_nn;  // Used for  CMSIS-NN implementation
 };
 
-TfLiteStatus PortOpData_s8(TfLiteContext* context, OpDataLSTM* params_ref,
-                           const LSTMKernelContents& kernel_content,
-                           cmsis_nn_lstm_params* params_cmsis_nn) {
+LSTMBuffers<int16_t> CMSIS_NN_CreateLSTMBuffers(TfLiteContext* context,
+                                                const int* buffer_indices) {
+  LSTMBuffers<int16_t> buffers;
+  buffers.buffer0 = reinterpret_cast<int16_t*>(
+      context->GetScratchBuffer(context, buffer_indices[0]));
+  buffers.buffer1 = reinterpret_cast<int16_t*>(
+      context->GetScratchBuffer(context, buffer_indices[1]));
+  buffers.buffer2 = reinterpret_cast<int16_t*>(
+      context->GetScratchBuffer(context, buffer_indices[2]));
+
+  return buffers;
+}
+
+void CMSIS_NN_VectorSum(int32_t* kernel_sum, const int32_t size1,
+                        const int32_t size2, const int8_t* weights,
+                        const int32_t offset, const int32_t* biases) {
+  arm_vector_sum_s8(kernel_sum, size1, size2, weights, offset, biases);
+}
+
+template <typename BiasType>
+TfLiteStatus CMSIS_NN_PortOpData(TfLiteContext* context, OpDataLSTM* params_ref,
+                                 const LSTMKernelContents& kernel_content,
+                                 cmsis_nn_lstm_params* params_cmsis_nn) {
   // Unwrap pointers
-  const int32_t* input_gate_bias =
-      tflite::micro::GetOptionalTensorData<int32_t>(
+  const BiasType* input_gate_bias =
+      tflite::micro::GetOptionalTensorData<BiasType>(
           kernel_content.GetInternalTensor(tflite::kLstmInputGateBiasTensor));
-  const int32_t* forget_gate_bias =
-      tflite::micro::GetOptionalTensorData<int32_t>(
+  const BiasType* forget_gate_bias =
+      tflite::micro::GetOptionalTensorData<BiasType>(
           kernel_content.GetInternalTensor(tflite::kLstmForgetGateBiasTensor));
-  const int32_t* cell_gate_bias = tflite::micro::GetOptionalTensorData<int32_t>(
-      kernel_content.GetInternalTensor(tflite::kLstmCellGateBiasTensor));
-  const int32_t* output_gate_bias =
-      tflite::micro::GetOptionalTensorData<int32_t>(
+  const BiasType* cell_gate_bias =
+      tflite::micro::GetOptionalTensorData<BiasType>(
+          kernel_content.GetInternalTensor(tflite::kLstmCellGateBiasTensor));
+  const BiasType* output_gate_bias =
+      tflite::micro::GetOptionalTensorData<BiasType>(
           kernel_content.GetInternalTensor(tflite::kLstmOutputGateBiasTensor));
 
   const int8_t* input_to_input_weights =
@@ -90,72 +111,72 @@
   int32_t size_data = params_ref->size_info.input_dimension;
   int32_t size_hidden = params_ref->size_info.state_dimension;
 
-  int32_t* input_data_kernel_sum{
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
-  int32_t* forget_data_kernel_sum{
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
-  int32_t* cell_data_kernel_sum{
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
-  int32_t* output_data_kernel_sum{
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
+  BiasType* input_data_kernel_sum{
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
+  BiasType* forget_data_kernel_sum{
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
+  BiasType* cell_data_kernel_sum{
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
+  BiasType* output_data_kernel_sum{
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
 
-  int32_t* input_hidden_kernel_sum{
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
-  int32_t* forget_hidden_kernel_sum{
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
-  int32_t* cell_hidden_kernel_sum = {
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
-  int32_t* output_hidden_kernel_sum = {
-      static_cast<int32_t*>(context->AllocatePersistentBuffer(
-          context, size_hidden * sizeof(int32_t)))};
+  BiasType* input_hidden_kernel_sum{
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
+  BiasType* forget_hidden_kernel_sum{
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
+  BiasType* cell_hidden_kernel_sum = {
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
+  BiasType* output_hidden_kernel_sum = {
+      static_cast<BiasType*>(context->AllocatePersistentBuffer(
+          context, size_hidden * sizeof(BiasType)))};
 
   // Compute effective biases
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       input_data_kernel_sum, size_data, size_hidden, input_to_input_weights,
       params_ref->input_gate_parameters.input_fc_params.input_offset,
       input_gate_bias);
 
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       forget_data_kernel_sum, size_data, size_hidden, input_to_forget_weights,
       params_ref->forget_gate_parameters.input_fc_params.input_offset,
       forget_gate_bias);
 
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       cell_data_kernel_sum, size_data, size_hidden, input_to_cell_weights,
       params_ref->cell_gate_parameters.input_fc_params.input_offset,
       cell_gate_bias);
 
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       output_data_kernel_sum, size_data, size_hidden, input_to_output_weights,
       params_ref->output_gate_parameters.input_fc_params.input_offset,
       output_gate_bias);
 
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       input_hidden_kernel_sum, size_hidden, size_hidden,
       recurrent_to_input_weights,
       -params_ref->inter_gate_parameters.output_mul_params.output_offset,
       nullptr);
 
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       forget_hidden_kernel_sum, size_hidden, size_hidden,
       recurrent_to_forget_weights,
       -params_ref->inter_gate_parameters.output_mul_params.output_offset,
       nullptr);
 
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       cell_hidden_kernel_sum, size_hidden, size_hidden,
       recurrent_to_cell_weights,
       -params_ref->inter_gate_parameters.output_mul_params.output_offset,
       nullptr);
 
-  arm_vector_sum_s8(
+  CMSIS_NN_VectorSum(
       output_hidden_kernel_sum, size_hidden, size_hidden,
       recurrent_to_output_weights,
       -params_ref->inter_gate_parameters.output_mul_params.output_offset,
@@ -242,10 +263,9 @@
   return kTfLiteOk;
 }
 
-template <typename CellType>
 TfLiteStatus CMSIS_NN_EvalInteger8x8_16Lstm(
     const OpData& op_data, const LSTMKernelContents& kernel_content,
-    const LSTMBuffers<CellType>& buffers) {
+    const LSTMBuffers<int16_t>& buffers) {
   TFLITE_DCHECK(
       kernel_content.GetInternalTensor(tflite::kLstmInputTensor)->dims->size >=
           2 &&
@@ -270,7 +290,6 @@
 }
 
 /*Kernel functions*/
-
 void* UnidirectionalSequenceLstmInit(TfLiteContext* context, const char* buffer,
                                      size_t length) {
   TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
@@ -327,20 +346,15 @@
   }
 
   size_t number_of_buffers;
-  if (activation_type != kTfLiteInt8) {
-    number_of_buffers = 4;
+  if (activation_type == kTfLiteInt8 && cell_state_type == kTfLiteInt16) {
+    auto kernel_content = CreateLSTMKernelContent(context, node);
+    number_of_buffers = 3;
+    CMSIS_NN_PortOpData<int32_t>(context, op_data_lstm, kernel_content,
+                                 &op_data->params_cmsis_nn);
   } else {
-    bool cmsis_nn_used = (cell_state_type == kTfLiteInt16);
-    if (cmsis_nn_used) {
-      auto kernel_content = CreateLSTMKernelContent(context, node);
-      PortOpData_s8(context, op_data_lstm, kernel_content,
-                    &op_data->params_cmsis_nn);
-
-      number_of_buffers = 3;
-    } else {
-      number_of_buffers = 4;
-    }
+    number_of_buffers = 4;
   }
+
   for (size_t i = 0; i < number_of_buffers; i++) {
     TF_LITE_ENSURE_OK(context, context->RequestScratchBufferInArena(
                                    context,
@@ -379,9 +393,9 @@
         case kTfLiteInt8: {
           // 8(activation)x8(weight)->16(cell) LSTM with 32 bits bias
           LSTMBuffers<int16_t> buffers =
-              CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
-          return CMSIS_NN_EvalInteger8x8_16Lstm<int16_t>(
-              op_data, kernel_content, buffers);
+              CMSIS_NN_CreateLSTMBuffers(context, op_data_lstm.buffer_indices);
+          return CMSIS_NN_EvalInteger8x8_16Lstm(op_data, kernel_content,
+                                                buffers);
           break;
         }
         default: {
@@ -435,10 +449,9 @@
 
   if (activation_type == kTfLiteInt8) {
     LSTMBuffers<int16_t> buffers =
-        CreateLSTMBuffers<int16_t>(context, op_data_lstm.buffer_indices);
+        CMSIS_NN_CreateLSTMBuffers(context, op_data_lstm.buffer_indices);
 
-    return CMSIS_NN_EvalInteger8x8_16Lstm<int16_t>(op_data, kernel_content,
-                                                   buffers);
+    return CMSIS_NN_EvalInteger8x8_16Lstm(op_data, kernel_content, buffers);
   } else {
     MicroPrintf("Input type %s (%d) not supported.",
                 TfLiteTypeGetName(activation_type), activation_type);
diff --git a/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test.cc
index 06e8e73..1e5a868 100644
--- a/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/micro/kernels/unidirectional_sequence_lstm_test.cc
@@ -28,7 +28,6 @@
 namespace {
 
 constexpr int kLstmMaxNumInputOutputTensors = 24 + 1;
-constexpr int kLstmIntermediateTensorBase = kLstmMaxNumInputOutputTensors + 1;
 
 // Validate the output result array with golden values
 template <typename T>
@@ -50,42 +49,20 @@
     LstmNodeContent<ActivationType, WeightType, BiasType, CellType, batch_size,
                     time_steps, input_dimension, state_dimension>&
         node_contents) {
-  TfLiteTensor tensors[kLstmMaxNumInputOutputTensors + 1 + 5];
-  memcpy(tensors, node_contents.GetTensors(),
-         kLstmMaxNumInputOutputTensors * sizeof(TfLiteTensor));
-
-  // Provide also intermediate tensors needed by older LSTM implementations
-  int intermediate_array_data[6] = {5,
-                                    kLstmIntermediateTensorBase,
-                                    kLstmIntermediateTensorBase + 1,
-                                    kLstmIntermediateTensorBase + 2,
-                                    kLstmIntermediateTensorBase + 3,
-                                    kLstmIntermediateTensorBase + 4};
-  int input_zero_points[2] = {1, -21};
-  float input_scales[2] = {1, 0.004705882165580988};
-  TfLiteAffineQuantization input_quant = {
-      tflite::testing::FloatArrayFromFloats(input_scales),
-      tflite::testing::IntArrayFromInts(input_zero_points), 0};
-  int intermediate_dim[2] = {1, 0};
-  for (int i = 0; i < 5; ++i) {
-    tensors[kLstmIntermediateTensorBase + i] =
-        CreateTensor<int16_t>(nullptr, IntArrayFromInts(intermediate_dim));
-    tensors[kLstmIntermediateTensorBase + i].quantization = {
-        kTfLiteAffineQuantization, &input_quant};
-  }
-
   const TFLMRegistration registration = Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
   auto buildin_data = node_contents.BuiltinData();
   micro::KernelRunner runner(
-      registration, tensors, kLstmMaxNumInputOutputTensors + 1 + 5,
+      registration, node_contents.GetTensors(), kLstmMaxNumInputOutputTensors,
       node_contents.KernelInputs(), node_contents.KernelOutputs(),
-      reinterpret_cast<void*>(&buildin_data),
-      IntArrayFromInts(intermediate_array_data));
+      reinterpret_cast<void*>(&buildin_data));
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, runner.InitAndPrepare());
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, runner.Invoke());
 
   const auto& quantization_settings = node_contents.QuantizationSettings();
 
+// CMSIS-NN does not use the hidden state and cell state tensors so these tests
+// fail.
+#if !defined(CMSIS_NN)
   float dequantized_hidden_state[batch_size * state_dimension] = {};
   Dequantize(node_contents.GetHiddenStateData(), batch_size * state_dimension,
              quantization_settings.hidden_state.scale,
@@ -104,6 +81,7 @@
   ValidateResultGoldens(eval_check_data.expected_cell_state,
                         dequantized_cell_state, batch_size * state_dimension,
                         cell_state_tolerance);
+#endif
 
   float dequantized_output[batch_size * state_dimension * time_steps] = {};
   Dequantize(node_contents.GetOutputData(),
@@ -162,9 +140,6 @@
                                                tolerance, float_node_contents);
 }
 
-// TODO(#2249) Unidirectional_sequence_lstm_test fails for new CMSIS-NN lstm
-// implementation
-#if !defined(CMSIS_NN)
 TF_LITE_MICRO_TEST(TestUnidirectionalLSTMInt8) {
   const tflite::testing::LstmEvalCheckData<12, 4, 12> kernel_eval_data =
       tflite::testing::Get2X2LstmEvalCheckData();
@@ -179,7 +154,6 @@
       kernel_eval_data, hidden_state_tolerance, cell_state_tolerance,
       int8_node_contents);
 }
-#endif
 
 TF_LITE_MICRO_TEST(TestUnidirectionalLSTMInt16) {
   const tflite::testing::LstmEvalCheckData<12, 4, 12> kernel_eval_data =
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh
index 601c4d4..fae77aa 100755
--- a/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh
+++ b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn_download.sh
@@ -47,9 +47,9 @@
   echo >&2 "${DOWNLOADED_CMSIS_NN_PATH} already exists, skipping the download."
 else
 
-  ZIP_PREFIX_NN="72e1ebf623ab1660a3e14e4e36fdcddce46f1991"
-  CMSIS_NN_URL="http://github.com/ARM-software/CMSIS-NN/archive/${ZIP_PREFIX_NN}.zip"
-  CMSIS_NN_MD5="23a623f4eca6c8f11ee5366c2cf61a44"
+  ZIP_PREFIX_NN="6cc31fb36fa330325b2bb0ffde3a7288384e58ab"
+  CMSIS_NN_URL="http://github.com/ARM-software/CMSIS-NN/archive/6cc31fb36fa330325b2bb0ffde3a7288384e58ab.zip"
+  CMSIS_NN_MD5="42000f264b93b7b6cd60c1b507792daf"
 
   # wget is much faster than git clone of the entire repo. So we wget a specific
   # version and can then apply a patch, as needed.
diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
index 653afad..64d0662 100644
--- a/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_corstone_300_makefile.inc
@@ -211,9 +211,4 @@
 endif
 MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
 
-# TODO(#2449) Examine why this test fails here.
-EXCLUDED_EXAMPLE_TESTS := \
-    tensorflow/lite/micro/examples/dtln/Makefile.inc
-MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS))
-
 TEST_SCRIPT := $(TENSORFLOW_ROOT)tensorflow/lite/micro/testing/test_with_arm_corstone_300.sh