Abbreviate benchmark results posted to pull requests (#6124)

We are accumulating more and more benchmarks. Posting all of the
results to the pull request produces a daunting amount of raw data.
Instead, we should categorize and sort them and provide only
abbreviated tables, leaving the full tables somewhere else.

This commit adjusts the pull request comment script with the above
improvements. The full benchmark tables are posted as GitHub Gists,
which keeps them on GitHub infrastructure and provides nice Markdown
rendering.
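
For reference, here is a minimal sketch of the Gist API call the
script relies on (the real implementation is post_to_gist() in the
diff below; the helper name create_gist is only illustrative, and the
token is assumed to carry the "gist" scope):

  import json
  import requests

  def create_gist(token: str, filename: str, content: str) -> str:
    # POST /gists creates a public Gist; the response JSON carries the
    # browsable "html_url" of the new Gist.
    response = requests.post(
        "https://api.github.com/gists",
        headers={
            "Accept": "application/vnd.github.v3+json",
            "Authorization": f"token {token}",
        },
        data=json.dumps(
            {"public": True, "files": {filename: {"content": content}}}))
    response.raise_for_status()
    return response.json()["html_url"]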
diff --git a/build_tools/android/post_benchmarks_as_pr_comment.py b/build_tools/android/post_benchmarks_as_pr_comment.py
index 61314eb..fc9aa1a 100755
--- a/build_tools/android/post_benchmarks_as_pr_comment.py
+++ b/build_tools/android/post_benchmarks_as_pr_comment.py
@@ -9,10 +9,12 @@
 This script is meant to be used by Buildkite for automation. It requires the
 following environment to be set:
 
+- BUILDKITE_BUILD_NUMBER: the build number of current Buildkite build.
 - BUILDKITE_BUILD_URL: the link to the current Buildkite build.
 - BUILDKITE_COMMIT: the pull request HEAD commit.
 - BUILDKITE_PULL_REQUEST: the current pull request number.
-- GITHUB_TOKEN: personal access token to authenticate against GitHub API.
+- GITHUB_TOKEN: personal access token to authenticate against the GitHub API;
+    it should have the "public_repo" and "gist" scopes.
 
 if --query-base in toggled on, then it additionally requires:
 
@@ -36,14 +38,29 @@
 import requests
 import markdown_strings as md
 
-from typing import Any, Dict, Sequence, Tuple, Union
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Sequence, Tuple, Union
 
 from common.benchmark_description import BenchmarkResults, get_output
 
+GITHUB_GIST_API_PREFIX = "https://api.github.com/gists"
 GITHUB_IREE_API_PREFIX = "https://api.github.com/repos/google/iree"
+GITHUB_IREE_REPO_PREFIX = "https://github.com/google/iree"
+GITHUB_USER = "iree-github-actions-bot"
 IREE_PROJECT_ID = 'IREE'
+# The ratio below which benchmarks will be considered similar to the base.
+SIMILAR_BENCHMARK_THRESHOLD = 0.05
+# The max number of rows to show per table.
+TABLE_SIZE_CUT = 3
 THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
-RESULT_EMPHASIS_THRESHOLD = 0.05
+
+
+def get_required_env_var(var: str) -> str:
+  """Gets the value for a required environment variable."""
+  value = os.getenv(var, None)
+  if value is None:
+    raise RuntimeError(f'Missing environment variable "{var}"')
+  return value
 
 
 def get_git_commit_hash(commit: str, verbose: bool = False) -> str:
@@ -61,12 +78,13 @@
   return int(count)
 
 
-def get_required_env_var(var: str) -> str:
-  """Gets the value for a required environment variable."""
-  value = os.getenv(var, None)
-  if value is None:
-    raise RuntimeError(f'Missing environment variable "{var}"')
-  return value
+def get_origin_tree_top_commit(verbose: bool = False) -> str:
+  """Returns the top of the tree commit for the origin base branch."""
+  base_branch = get_required_env_var("BUILDKITE_PULL_REQUEST_BASE_BRANCH")
+  get_output(['git', 'fetch', '--prune', '--', 'origin', base_branch],
+             cwd=THIS_DIRECTORY,
+             verbose=verbose)
+  return get_git_commit_hash(f'origin/{base_branch}', verbose)
 
 
 def get_from_dashboard(url: str,
@@ -84,11 +102,24 @@
     raise requests.RequestException(
         f'Failed to get from dashboard server with status code {code}')
 
-  return response.json()
+  data = response.json()
+  if verbose:
+    print(f'Queried base benchmark data: {data}')
+  return data
+
+
+@dataclass
+class AggregateBenchmarkLatency:
+  """An object for describing aggregate latency numbers for a benchmark."""
+  mean_time: int
+  median_time: int
+  stddev_time: int
+  # The average latency time for the base commit to compare against.
+  base_mean_time: Optional[int] = None
 
 
 def aggregate_all_benchmarks(
-    benchmark_files: Sequence[str]) -> Sequence[Tuple[Union[str, int]]]:
+    benchmark_files: Sequence[str]) -> Dict[str, AggregateBenchmarkLatency]:
   """Aggregates all benchmarks in the given files.
 
   Args:
@@ -96,7 +127,7 @@
     BenchmarkResults.
 
   Returns:
-  - A list of (name, mean-latency, median-latency, stddev-latency) tuples.
+  - A dict of benchmark names to AggregateBenchmarkLatency numbers.
   """
 
   pr_commit = get_required_env_var("BUILDKITE_COMMIT")
@@ -123,9 +154,10 @@
       median_time = file_results.get_aggregate_time(benchmark_index, "median")
       stddev_time = file_results.get_aggregate_time(benchmark_index, "stddev")
 
-      aggregate_results[name] = (mean_time, median_time, stddev_time)
+      aggregate_results[name] = AggregateBenchmarkLatency(
+          mean_time, median_time, stddev_time)
 
-  return sorted([(k,) + v for k, v in aggregate_results.items()])
+  return aggregate_results
 
 
 def query_base_benchmark_results(commit,
@@ -138,68 +170,209 @@
   return get_from_dashboard(f'{url}/apis/getBuild', payload, verbose=verbose)
 
 
-def get_comparsion_against_base(pr_means: Sequence[int],
-                                base_means: Sequence[int]) -> Sequence[str]:
-  """Returns a tuple of strings comparsing mean latency numbers."""
-  comparisions = []
+def add_header_and_get_markdown_table(names: Tuple[str, ...],
+                                      means: Tuple[Any, ...],
+                                      medians: Tuple[int, ...],
+                                      stddevs: Tuple[int, ...],
+                                      size_cut: Optional[int] = None) -> str:
+  """Generates a markdown table with proper headers for benchmarks.
 
-  for pr, base in zip(pr_means, base_means):
-    if base is None:
-      comparisions.append(str(pr))
-      continue
-
-    diff = abs(pr - base) / base
-    if pr > base:
-      percent = "{:.2%}".format(diff)
-      direction = "↑"
-      if diff > RESULT_EMPHASIS_THRESHOLD:
-        direction += ", 🚩"
-    elif pr < base:
-      percent = "{:.2%}".format(diff)
-      direction = "↓"
-      if diff > RESULT_EMPHASIS_THRESHOLD:
-        direction += ", 🎉"
-    else:
-      percent = "{:.0%}".format(diff)
-      direction = ""
-
-    comparisions.append(f"{pr} (vs. {base}, {percent}{direction})")
-
-  return tuple(comparisions)
-
-
-def get_benchmark_result_markdown(benchmark_files: Sequence[str],
-                                  query_base: bool,
-                                  verbose: bool = False) -> str:
-  """Gets markdown summary of all benchmarks in the given files."""
-  all_benchmarks = aggregate_all_benchmarks(benchmark_files)
-  names, means, medians, stddevs = zip(*all_benchmarks)
-
-  build_url = get_required_env_var("BUILDKITE_BUILD_URL")
-  pr_commit = get_required_env_var("BUILDKITE_COMMIT")
-  if query_base:
-    base_branch = get_required_env_var("BUILDKITE_PULL_REQUEST_BASE_BRANCH")
-    commit = get_git_commit_hash(base_branch, verbose)
-    base_benchmarks = query_base_benchmark_results(commit, verbose)
-    base_means = [base_benchmarks.get(v) for v in names]
-    means = get_comparsion_against_base(means, base_means)
-    commit_info = f"@ commit {pr_commit} (vs. base {commit})"
-  else:
-    commit_info = f"@ commit {pr_commit}"
+  Args:
+  - size_cut: If not None, only show the top N results for each table.
+  """
+  total_size = len(names)
+  if size_cut is not None:
+    names = names[0:size_cut]
+    means = means[0:size_cut]
+    medians = medians[0:size_cut]
+    stddevs = stddevs[0:size_cut]
 
   names = ("Benchmark Name",) + names
   means = ("Average Latency (ms)",) + means
   medians = ("Median Latency (ms)",) + medians
   stddevs = ("Latency Standard Deviation (ms)",) + stddevs
 
-  header = md.header("Benchmark results", 3)
-  benchmark_table = md.table([names, means, medians, stddevs])
-  link = "See more details on " + md.link("Buildkite", build_url)
-
-  return "\n\n".join([header, commit_info, benchmark_table, link])
+  table_str = md.table([names, means, medians, stddevs])
+  if size_cut is not None and size_cut < total_size:
+    table_str += "\n\n"
+    table_str += md.italics(
+        f"[Top {size_cut} out of {total_size} benchmark results showed]")
+  return table_str
 
 
-def comment_on_pr(content):
+def sort_benchmarks_and_get_table(benchmarks: Dict[str,
+                                                   AggregateBenchmarkLatency],
+                                  size_cut: Optional[int] = None) -> str:
+  """Sorts all benchmarks according to the improvement/regression ratio and
+  returns a markdown table for it.
+
+  Args:
+  - size_cut: If not None, only show the top N results for each table.
+  """
+  sorted_benchmarks = []
+  for k, v in benchmarks.items():
+    ratio = abs(v.mean_time - v.base_mean_time) / v.base_mean_time
+    sorted_benchmarks.append((k, (v.mean_time, v.base_mean_time, ratio),
+                              v.median_time, v.stddev_time))
+  # Sort according to ratio in the reverse order.
+  sorted_benchmarks.sort(key=lambda benchmark: benchmark[1][2], reverse=True)
+
+  # Split each field into its own tuple in preparation for the markdown table.
+  names, means, medians, stddevs = zip(*sorted_benchmarks)
+
+  # Turn the means tuple into strings that show the comparison against base.
+  str_means = []
+  for pr, base, ratio in means:
+    direction = "↑" if pr > base else ("↓" if pr < base else "")
+    str_means.append(f"{pr} (vs. {base}, {ratio:.2%}{direction})")
+  str_means = tuple(str_means)
+
+  return add_header_and_get_markdown_table(names, str_means, medians, stddevs,
+                                           size_cut)
+
+
+def categorize_benchmarks_into_tables(benchmarks: Dict[
+    str, AggregateBenchmarkLatency],
+                                      similar_threshold: float,
+                                      size_cut: Optional[int] = None) -> str:
+  """Splits benchmarks into regressed/improved/similar/raw categories and
+  returns their markdown tables.
+
+  Args:
+  - similar_threshold: the threshold under which a benchmark will be
+      considered similar to its base commit.
+  - size_cut: If not None, only show the top N results for each table.
+  """
+  regressed, improved, similar, raw = {}, {}, {}, {}
+
+  for name, results in benchmarks.items():
+    # Without information about the base result we cannot compare; keep it raw.
+    if results.base_mean_time is None:
+      raw[name] = results
+      continue
+
+    current = results.mean_time
+    base = results.base_mean_time
+    ratio = abs(current - base) / base
+    if ratio <= similar_threshold:
+      similar[name] = results
+    elif current > base:
+      regressed[name] = results
+    else:
+      improved[name] = results
+
+  tables = []
+  if regressed:
+    tables.append(md.header("Regressed Benchmarks 🚩", 3))
+    tables.append(sort_benchmarks_and_get_table(regressed, size_cut))
+  if improved:
+    tables.append(md.header("Improved Benchmarks 🎉", 3))
+    tables.append(sort_benchmarks_and_get_table(improved, size_cut))
+  # If we want to abbreviate, similar results won't be interesting.
+  if similar and size_cut is None:
+    tables.append(md.header("Similar Benchmarks", 3))
+    tables.append(sort_benchmarks_and_get_table(similar, size_cut))
+  if raw:
+    tables.append(md.header("Similar Benchmarks", 3))
+    raw_list = [
+        (k, v.mean_time, v.median_time, v.stddev_time) for k, v in raw.items()
+    ]
+    names, means, medians, stddevs = zip(*raw_list)
+    tables.append(
+        add_header_and_get_markdown_table(names=names,
+                                          means=means,
+                                          medians=medians,
+                                          stddevs=stddevs,
+                                          size_cut=size_cut))
+  return "\n\n".join(tables)
+
+
+def get_benchmark_result_markdown(benchmark_files: Sequence[str],
+                                  query_base: bool,
+                                  verbose: bool = False) -> Tuple[str, str]:
+  """Gets the full/abbreviated markdown summary of all benchmarks in files."""
+  all_benchmarks = aggregate_all_benchmarks(benchmark_files)
+
+  build_url = get_required_env_var("BUILDKITE_BUILD_URL")
+  pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST")
+  pr_commit = get_required_env_var("BUILDKITE_COMMIT")
+  pr_commit = md.link(pr_commit,
+                      f"{GITHUB_IREE_REPO_PREFIX}/commit/{pr_commit}")
+  if query_base:
+    # Update the aggregate benchmarks with base numbers.
+    base_commit = get_origin_tree_top_commit(verbose)
+    base_benchmarks = query_base_benchmark_results(base_commit, verbose)
+    for bench in base_benchmarks:
+      if bench in all_benchmarks:
+        all_benchmarks[bench].base_mean_time = base_benchmarks[bench]
+    base_commit = md.link(base_commit,
+                          f"{GITHUB_IREE_REPO_PREFIX}/commit/{base_commit}")
+    commit_info = f"@ commit {pr_commit} (vs. base {base_commit})"
+  else:
+    commit_info = f"@ commit {pr_commit}"
+
+  pr_info = md.link("Pull request",
+                    f"{GITHUB_IREE_REPO_PREFIX}/pull/{pr_number}")
+  buildkite_info = md.link("Buildkite build", build_url)
+
+  # Compose the full benchmark tables.
+  full_table = [md.header("Full Benchmark Summary", 2)]
+  full_table.append(md.unordered_list([commit_info, pr_info, buildkite_info]))
+  full_table.append(
+      categorize_benchmarks_into_tables(all_benchmarks,
+                                        SIMILAR_BENCHMARK_THRESHOLD))
+
+  # Compose the abbreviated benchmark tables.
+  abbr_table = [md.header("Abbreviated Benchmark Summary", 2)]
+  abbr_table.append(commit_info)
+  abbr_table.append(
+      categorize_benchmarks_into_tables(all_benchmarks,
+                                        SIMILAR_BENCHMARK_THRESHOLD,
+                                        TABLE_SIZE_CUT))
+  abbr_table.append("For more information:")
+  # We don't know the Gist URL until it is actually created. Use a placeholder
+  # for now and replace it later.
+  full_result_info = md.link("Full benchmark result tables",
+                             "<<placeholder-link>>")
+  abbr_table.append(md.unordered_list([full_result_info, buildkite_info]))
+
+  return "\n\n".join(full_table), "\n\n".join(abbr_table)
+
+
+def post_to_gist(filename: str, content: str, verbose: bool = False) -> str:
+  """Posts the given content to a new GitHub Gist and returns the URL to it."""
+  api_token = get_required_env_var('GITHUB_TOKEN')
+  headers = {
+      "Accept": "application/vnd.github.v3+json",
+      "Authorization": f"token {api_token}",
+  }
+  payload = json.dumps({
+      "public": True,
+      "files": {
+          filename: {
+              "content": content
+          }
+      }
+  })
+
+  api_endpoint = GITHUB_GIST_API_PREFIX
+  response = requests.post(api_endpoint, data=payload, headers=headers)
+  if response.status_code != 201:
+    raise requests.RequestException(
+        f"Failed to comment on GitHub; error code: {response.status_code}")
+
+  response = response.json()
+  if verbose:
+    print(response)
+
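+  # The Gist API flags the response as truncated when it cannot return the
+  # full content; fail loudly rather than link to incomplete tables.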
+  if response["truncated"]:
+    raise requests.RequestException(f"Content too large and gotten truncated")
+
+  gist_id = response["id"]
+  return f"https://gist.github.com/{GITHUB_USER}/{gist_id}"
+
+
+def comment_on_pr(content, verbose: bool = False):
   """Posts the given content as comments to the current pull request."""
   pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST")
   # Buildkite sets this to "false" if not running on a PR:
@@ -254,14 +427,18 @@
 
 
 def main(args):
-  benchmarks_md = get_benchmark_result_markdown(args.benchmark_files,
-                                                query_base=args.query_base,
-                                                verbose=args.verbose)
+  full_md, abbr_md = get_benchmark_result_markdown(args.benchmark_files,
+                                                   query_base=args.query_base,
+                                                   verbose=args.verbose)
 
   if args.dry_run:
-    print(benchmarks_md)
+    print(full_md, "\n\n", abbr_md)
   else:
-    comment_on_pr(benchmarks_md)
+    build_number = get_required_env_var("BUILDKITE_BUILD_NUMBER")
+    filename = f"iree-full-benchmark-result-{build_number}.md"
+    gist_url = post_to_gist(filename, full_md, args.verbose)
+    abbr_md = abbr_md.replace("<<placeholder-link>>", gist_url)
+    comment_on_pr(abbr_md, args.verbose)
 
 
 if __name__ == "__main__":