# Copyright 2021 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
import urllib.parse
import markdown_strings as md
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Tuple
from .benchmark_definition import BenchmarkResults
from .benchmark_thresholds import BENCHMARK_THRESHOLDS, ThresholdUnit
PERFBOARD_SERIES_PREFIX = "https://perf.iree.dev/serie?IREE?"
@dataclass
class AggregateBenchmarkLatency:
"""An object for describing aggregate latency numbers for a benchmark."""
mean_time: int
median_time: int
stddev_time: int
# The average latency time for the base commit to compare against.
base_mean_time: Optional[int] = None
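
# Illustrative construction (made-up numbers, not from the original module):
# a PR mean of 12 against a base mean of 10 would be represented as
#   AggregateBenchmarkLatency(mean_time=12, median_time=11, stddev_time=1,
#                             base_mean_time=10)
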
def aggregate_all_benchmarks(
benchmark_files: Sequence[str],
expected_pr_commit: Optional[str] = None,
verbose: bool = False) -> Dict[str, AggregateBenchmarkLatency]:
"""Aggregates all benchmarks in the given files.
Args:
- benchmark_files: A list of JSON files, each of which can be decoded as a
BenchmarkResults.
- expected_pr_commit: An optional Git commit SHA to match against.
Returns:
- A dict of benchmark names to AggregateBenchmarkLatency numbers.
"""
aggregate_results = {}
for benchmark_file in benchmark_files:
with open(benchmark_file) as f:
content = f.read()
file_results = BenchmarkResults.from_json_str(content)
if (expected_pr_commit is not None) and \
(file_results.commit != expected_pr_commit):
raise ValueError("Inconsistent pull request commit")
for benchmark_index in range(len(file_results.benchmarks)):
benchmark_case = file_results.benchmarks[benchmark_index]
# Make sure each benchmark has a unique name.
name = str(benchmark_case.benchmark_info)
if name in aggregate_results:
raise ValueError(f"Duplicated benchmarks: {name}")
# Now scan all benchmark iterations and find the aggregate results.
mean_time = file_results.get_aggregate_time(benchmark_index, "mean")
median_time = file_results.get_aggregate_time(benchmark_index, "median")
stddev_time = file_results.get_aggregate_time(benchmark_index, "stddev")
aggregate_results[name] = AggregateBenchmarkLatency(
mean_time, median_time, stddev_time)
return aggregate_results
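
# A minimal usage sketch (the file name and commit SHA below are hypothetical):
#
#   results = aggregate_all_benchmarks(["benchmark_results.json"],
#                                      expected_pr_commit="deadbeef")
#   for name, latency in results.items():
#     print(name, latency.mean_time, latency.median_time, latency.stddev_time)
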
def _make_benchmark_clickable(name: str) -> str:
"""Add link to the given benchmark name."""
url = PERFBOARD_SERIES_PREFIX + urllib.parse.quote(name, safe="()[]@,")
return md.link(name, url)
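
# For illustration (hypothetical benchmark name): "MobileNetV2 (fp32) @ Pixel-4"
# becomes a markdown link whose target is PERFBOARD_SERIES_PREFIX plus the
# URL-quoted name, where "()[]@," stay unescaped and other special characters
# such as spaces are percent-encoded.
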
def _add_header_and_get_markdown_table(names: Tuple[str, ...],
means: Tuple[Any, ...],
medians: Tuple[int, ...],
stddevs: Tuple[int, ...],
size_cut: Optional[int] = None) -> str:
"""Generates a markdown table with proper headers for benchmarks.
Args:
- size_cut: If not None, only show the top N results for each table.
"""
total_size = len(names)
if size_cut is not None:
names = names[0:size_cut]
means = means[0:size_cut]
medians = medians[0:size_cut]
stddevs = stddevs[0:size_cut]
names = tuple([_make_benchmark_clickable(name) for name in names])
names = ("Benchmark Name",) + names
means = ("Average Latency (ms)",) + means
medians = ("Median Latency (ms)",) + medians
stddevs = ("Latency Standard Deviation (ms)",) + stddevs
table_str = md.table([names, means, medians, stddevs])
if size_cut is not None and size_cut < total_size:
table_str += "\n\n"
table_str += md.italics(
f"[Top {size_cut} out of {total_size} benchmark results showed]")
return table_str
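
# The resulting table has four columns, roughly shaped like this (sketch only;
# the exact padding and escaping are handled by markdown_strings):
#
#   | Benchmark Name | Average Latency (ms) | Median Latency (ms) | Latency Standard Deviation (ms) |
#   | -------------- | -------------------- | ------------------- | ------------------------------- |
#   | [name](url)    | ...                  | ...                 | ...                             |
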
def _sort_benchmarks_and_get_table(benchmarks: Dict[str,
AggregateBenchmarkLatency],
size_cut: Optional[int] = None) -> str:
"""Sorts all benchmarks according to the improvement/regression ratio and
returns a markdown table for it.
Args:
- size_cut: If not None, only show the top N results for each table.
"""
sorted_benchmarks = []
for k, v in benchmarks.items():
ratio = abs(v.mean_time - v.base_mean_time) / v.base_mean_time
sorted_benchmarks.append((k, (v.mean_time, v.base_mean_time, ratio),
v.median_time, v.stddev_time))
# Sort by the improvement/regression ratio in descending order.
sorted_benchmarks.sort(key=lambda benchmark: benchmark[1][2], reverse=True)
# Split each field into its own tuple in preparation for the markdown table.
names, means, medians, stddevs = zip(*sorted_benchmarks)
# Turn each (pr mean, base mean, ratio) tuple into a display string.
str_means = []
for pr, base, ratio in means:
direction = "↑" if pr > base else ("↓" if pr < base else "")
str_means.append(f"{pr} (vs. {base}, {ratio:.2%}{direction})")
str_means = tuple(str_means)
return _add_header_and_get_markdown_table(names, str_means, medians, stddevs,
size_cut)
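
# For example (made-up numbers): a benchmark whose mean moved from a base of 10
# to a current value of 12 has a ratio of 0.2 and renders in the mean column as
# "12 (vs. 10, 20.00%↑)"; it sorts ahead of entries with smaller relative
# changes.
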
def categorize_benchmarks_into_tables(benchmarks: Dict[
str, AggregateBenchmarkLatency],
size_cut: Optional[int] = None) -> str:
"""Splits benchmarks into regressed/improved/similar/raw categories and
returns their markdown tables.
Args:
- benchmarks: A dictionary mapping benchmark names to their aggregate info.
- size_cut: If not None, only show the top N results for each table.
"""
regressed, improved, similar, raw = {}, {}, {}, {}
for name, results in benchmarks.items():
# If there is no information about the base result, we cannot analyze.
if results.base_mean_time is None:
raw[name] = results
continue
similar_threshold = None
for threshold in BENCHMARK_THRESHOLDS:
if threshold.regex.match(name):
similar_threshold = threshold
break
if similar_threshold is None:
raise ValueError(f"no matched threshold setting for benchmark: {name}")
current = results.mean_time
base = results.base_mean_time
if similar_threshold.unit == ThresholdUnit.PERCENTAGE:
ratio = abs(current - base) / base * 100
else:
ratio = abs(current - base)
if ratio <= similar_threshold.threshold:
similar[name] = results
elif current > base:
regressed[name] = results
else:
improved[name] = results
tables = []
if regressed:
tables.append(md.header("Regressed Benchmarks 🚩", 3))
tables.append(_sort_benchmarks_and_get_table(regressed, size_cut))
if improved:
tables.append(md.header("Improved Benchmarks 🎉", 3))
tables.append(_sort_benchmarks_and_get_table(improved, size_cut))
# If we want to abbreviate, similar results won't be interesting.
if similar and size_cut is None:
tables.append(md.header("Similar Benchmarks", 3))
tables.append(_sort_benchmarks_and_get_table(similar, size_cut))
if raw:
tables.append(md.header("Raw Benchmarks", 3))
raw_list = [
(k, v.mean_time, v.median_time, v.stddev_time) for k, v in raw.items()
]
names, means, medians, stddevs = zip(*raw_list)
tables.append(
_add_header_and_get_markdown_table(names=names,
means=means,
medians=medians,
stddevs=stddevs,
size_cut=size_cut))
return "\n\n".join(tables)