Measure MobileBERT fp16 full inference time for Mali GPUs (#7403)

We have MobileBERT benchmark configurations for both kernel-execution and
full-inference with fp32, but for fp16 we only have kernel-execution. This
adds the missing fp16 full-inference configuration targeting Mali GPUs.
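For reference, the TRANSLATION_FLAGS in the new configuration below roughly
correspond to an iree-translate invocation like the following (a sketch only;
the input/output file names are hypothetical and the exact tool spelling may
differ across IREE versions):

  iree-translate \
    -iree-mlir-to-vm-bytecode-module \
    -iree-hal-target-backends=vulkan-spirv \
    --iree-input-type=mhlo \
    --iree-flow-demote-f32-to-f16 \
    --iree-vulkan-target-triple=valhall-unknown-android11 \
    --iree-flow-inline-constants-max-byte-length=16 \
    --iree-enable-fusion-with-reduction-ops \
    mobilebert-fp16.mlir -o mobilebert-fp16-vulkan.vmfb

The --iree-flow-demote-f32-to-f16 flag is what turns the imported fp32
MobileBERT model into the fp16 variant being measured here.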
diff --git a/benchmarks/TensorFlow/CMakeLists.txt b/benchmarks/TensorFlow/CMakeLists.txt
index 6f95f86..48c79b3 100644
--- a/benchmarks/TensorFlow/CMakeLists.txt
+++ b/benchmarks/TensorFlow/CMakeLists.txt
@@ -270,6 +270,27 @@
     "--batch_size=32"
 )
 
+# GPU, Vulkan, Mali, full-inference
+iree_mlir_benchmark_suite(
+  MODULES
+    ${MOBILEBERT_FP16_MODULE}
+
+  BENCHMARK_MODES
+    "full-inference"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-flow-demote-f32-to-f16"
+    "--iree-vulkan-target-triple=valhall-unknown-android11"
+    "--iree-flow-inline-constants-max-byte-length=16"
+    "--iree-enable-fusion-with-reduction-ops"
+  DRIVER
+    "vulkan"
+)
+
 ################################################################################
 #                                                                              #
 # Special benchmark configurations                                             #