Introduce a noisy benchmark list to control threshold and comment (#7016) These benchmarks seem to suffer from phone issues, so their results fluctuate over a large range. Introduce a filter list to reduce the noise and avoid confusion while the phones are being fixed.

commit: 3d3f24b476deba89423421cc21fb787a9fbc33c1 [log] [tgz]
author: Lei Zhang <antiagainst@google.com> Fri Sep 10 15:41:51 2021 -0400
committer: GitHub <noreply@github.com> Fri Sep 10 15:41:51 2021 -0400
tree: e922c10d2d0746b4820d77ed1fe5e43a0c7ca5f6
parent: 664005aa366d81235d8886e3de24773932e086ed [diff]
diff --git a/build_tools/android/common/noisy_benchmarks.py b/build_tools/android/common/noisy_benchmarks.py
new file mode 100644
index 0000000..7fcb354
--- /dev/null
+++ b/build_tools/android/common/noisy_benchmarks.py

@@ -0,0 +1,24 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""A list of noisy benchmarks and their average thresholds."""
+
+import re
+
+# A list of noisy benchmarks. Each one is a tuple that contains the following
+# fields:
+# - A regular expression to match against the benchmark identifier.
+# - A threshold for computing the benchmark value average. Benchmark sample
+#   values from consecutive runs and within the given range will be considered
+#   as similar (with some noise). They will be used to compute the moving
+#   average. It is a string that is sent directly to Dana in the API call
+#   JSON payload. Two formats are supported: a percentage or an absolute
+#   value, so we use a string here. What value to set depends on the noise
+#   range of the particular benchmark.
+NOISY_BENCHMARKS = [
+    (re.compile(r"^PoseNet.*GPU-Mali-G77"), "100%"),
+    (re.compile(r"^DeepLabV3.*GPU-Mali-G77"), "100%"),
+    (re.compile(r"^MobileSSD.*GPU-Mali-G77"), "100%"),
+]

diff --git a/build_tools/android/post_benchmarks_as_pr_comment.py b/build_tools/android/post_benchmarks_as_pr_comment.py
index e47736b..e5c1475 100755
--- a/build_tools/android/post_benchmarks_as_pr_comment.py
+++ b/build_tools/android/post_benchmarks_as_pr_comment.py

@@ -44,6 +44,7 @@
 from typing import Any, Dict, Optional, Sequence, Tuple, Union
 
 from common.benchmark_description import BenchmarkResults, get_output
+from common.noisy_benchmarks import NOISY_BENCHMARKS
 
 ABBR_PR_COMMENT_TITLE = "Abbreviated Benchmark Summary"
 GITHUB_GIST_API_PREFIX = "https://api.github.com/gists"
@@ -126,7 +127,8 @@
 
 
 def aggregate_all_benchmarks(
-    benchmark_files: Sequence[str]) -> Dict[str, AggregateBenchmarkLatency]:
+    benchmark_files: Sequence[str],
+    verbose: bool = False) -> Dict[str, AggregateBenchmarkLatency]:
   """Aggregates all benchmarks in the given files.
 
   Args:
@@ -156,6 +158,12 @@
       if name in aggregate_results:
         raise ValueError(f"Duplicated benchmarks: {name}")
 
+      # Filter noisy benchmarks out.
+      if any([regex.match(name) is not None for regex, _ in NOISY_BENCHMARKS]):
+        if verbose:
+          print(f"Skipping noisy benchmark '{name}'")
+        continue
+
       # Now scan all benchmark iterations and find the aggregate results.
       mean_time = file_results.get_aggregate_time(benchmark_index, "mean")
       median_time = file_results.get_aggregate_time(benchmark_index, "median")
@@ -305,7 +313,7 @@
                                   query_base: bool,
                                   verbose: bool = False) -> Tuple[str, str]:
   """Gets the full/abbreviated markdown summary of all benchmarks in files."""
-  all_benchmarks = aggregate_all_benchmarks(benchmark_files)
+  all_benchmarks = aggregate_all_benchmarks(benchmark_files, verbose=verbose)
 
   build_url = get_required_env_var("BUILDKITE_BUILD_URL")
   pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST")

diff --git a/build_tools/android/upload_benchmarks_to_dashboard.py b/build_tools/android/upload_benchmarks_to_dashboard.py
index 093ff02..63c136f 100755
--- a/build_tools/android/upload_benchmarks_to_dashboard.py
+++ b/build_tools/android/upload_benchmarks_to_dashboard.py

@@ -28,6 +28,7 @@
 
 from common.benchmark_description import (BenchmarkInfo, BenchmarkResults,
                                           get_output)
+from common.noisy_benchmarks import NOISY_BENCHMARKS
 
 IREE_GITHUB_COMMIT_URL_PREFIX = 'https://github.com/google/iree/commit'
 IREE_PROJECT_ID = 'IREE'
@@ -210,9 +211,18 @@
                         verbose: bool = False):
   """Posts a new series to the dashboard."""
   url = get_required_env_var('IREE_DASHBOARD_URL')
+
+  average_range = '5%'
+  # Adjust average threshold for noisy benchmarks.
+  for regex, threshold in NOISY_BENCHMARKS:
+    if regex.match(series_id):
+      average_range = threshold
+      break
+
   payload = compose_series_payload(IREE_PROJECT_ID,
                                    series_id,
                                    series_description,
+                                   average_range=average_range,
                                    override=override)
   post_to_dashboard(f'{url}/apis/addSerie',
                     payload,
commit	3d3f24b476deba89423421cc21fb787a9fbc33c1	[log] [tgz]
author	Lei Zhang <antiagainst@google.com>	Fri Sep 10 15:41:51 2021 -0400
committer	GitHub <noreply@github.com>	Fri Sep 10 15:41:51 2021 -0400
tree	e922c10d2d0746b4820d77ed1fe5e43a0c7ca5f6
parent	664005aa366d81235d8886e3de24773932e086ed [diff]