# Copyright 2021 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import (
Any,
Callable,
Dict,
Generic,
List,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
)
import pathlib
import dataclasses
import json
import urllib.parse
import markdown_strings as md
import math
from common import benchmark_definition, benchmark_thresholds
from common.benchmark_thresholds import (
BENCHMARK_THRESHOLDS,
COMPILATION_TIME_THRESHOLDS,
TOTAL_ARTIFACT_SIZE_THRESHOLDS,
TOTAL_DISPATCH_SIZE_THRESHOLDS,
BenchmarkThreshold,
ThresholdUnit,
)
GetMetricFunc = Callable[[Any], Tuple[int, Optional[int]]]
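# Base URL of the benchmark dashboard; make_series_link() below appends the
# URL-quoted series id to form a link to an individual series.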
PERFBOARD_SERIES_PREFIX = "https://perf.iree.dev/serie?IREE?"
BENCHMARK_RESULTS_HEADERS = [
"Benchmark Name",
"Average Latency (ms)",
"Median Latency (ms)",
"Latency Standard Deviation (ms)",
]
# Since we don't have a structured way to store metric data yet, each metric
# is assigned a fixed id generated from uuid.uuid4() to identify its series.
COMPILATION_TIME_METRIC_ID = "e54cd682-c079-4c42-b4ad-d92c4bedea13"
COMPILATION_TIME_SERIES_SUFFIX = "compilation:module:compilation-time"
TOTAL_DISPATCH_SIZE_METRIC_ID = "9e15f7e6-383c-47ec-bd38-ecba55a5f10a"
TOTAL_DISPATCH_SIZE_SERIES_SUFFIX = (
"compilation:module:component-size:total-dispatch-size"
)
TOTAL_ARTIFACT_SIZE_METRIC_ID = "2c8a9198-c01c-45b9-a7da-69c82cf749f7"
TOTAL_ARTIFACT_SIZE_SERIES_SUFFIX = "compilation:module:total-artifact-size"
STREAM_IR_DISPATCH_COUNT_METRIC_ID = "7b72cd9e-43ed-4078-b6d3-20b810f9e4ad"
STREAM_IR_DISPATCH_COUNT_SERIES_SUFFIX = "compilation:ir:stream-dispatch-count"
@dataclass
class AggregateBenchmarkLatency:
"""An object for describing aggregate latency numbers for a benchmark."""
name: str
benchmark_info: benchmark_definition.BenchmarkInfo
mean_time: int
median_time: int
stddev_time: int
# The average latency time for the base commit to compare against.
base_mean_time: Optional[int] = None
def __str__(self) -> str:
return self.name
@dataclass(frozen=True)
class CompilationMetrics:
"""An object for describing the summary of statistics and the reference."""
name: str
compilation_info: benchmark_definition.CompilationInfo
compilation_time_ms: int
total_dispatch_component_bytes: int
total_artifact_bytes: int
stream_ir_dispatch_count: int
base_compilation_time_ms: Optional[int] = None
base_total_artifact_bytes: Optional[int] = None
base_total_dispatch_component_bytes: Optional[int] = None
base_stream_ir_dispatch_count: Optional[int] = None
def __str__(self) -> str:
return self.name
T = TypeVar("T")
class MetricsToTableMapper(ABC, Generic[T]):
"""Abstract class to help map benchmark metrics to table.
It contains a set of methods to help table generator get the required
information for a metric. For example, extract the current and base metric
value, the metric thresholds, the table header of the metrics, ...
"""
@abstractmethod
def update_base_value(self, obj: T, base_value: Any) -> T:
"""Sets the base value and returns the updated metric object."""
raise NotImplementedError()
@abstractmethod
def get_current_and_base_value(self, obj: T) -> Tuple[int, Optional[int]]:
"""Returns the current and base (can be None) value."""
raise NotImplementedError()
def get_series_id(self, benchmark_id: str) -> str:
"""Returns the dashboard series id."""
return f"{benchmark_id}-{self.get_metric_id()}"
@abstractmethod
def get_metric_id(self) -> str:
"""Returns the dashboard series id."""
raise NotImplementedError()
@abstractmethod
def get_series_name(self, name: str) -> str:
"""Returns the dashboard series name."""
raise NotImplementedError()
@abstractmethod
def get_unit(self) -> str:
"""Returns the unit of the metric value."""
raise NotImplementedError()
@abstractmethod
def get_table_header(self) -> str:
"""Returns the header of the table."""
raise NotImplementedError()
@staticmethod
@abstractmethod
    def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
        """Returns the threshold settings for this metric."""
        raise NotImplementedError()
@staticmethod
@abstractmethod
    def get_table_title() -> str:
        """Returns the title of the table."""
        raise NotImplementedError()
class CompilationTimeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to compilation time column."""
def update_base_value(
self, compile_metrics: CompilationMetrics, base_value: Any
) -> CompilationMetrics:
return dataclasses.replace(compile_metrics, base_compilation_time_ms=base_value)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics
) -> Tuple[int, Optional[int]]:
return (
compile_metrics.compilation_time_ms,
compile_metrics.base_compilation_time_ms,
)
def get_metric_id(self) -> str:
return COMPILATION_TIME_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{COMPILATION_TIME_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "ms"
def get_table_header(self) -> str:
return f"Compilation Time ({self.get_unit()})"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return COMPILATION_TIME_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Compilation Times"
class TotalDispatchSizeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to total dispatch size column."""
def update_base_value(
self, compile_metrics: CompilationMetrics, base_value: Any
) -> CompilationMetrics:
return dataclasses.replace(
compile_metrics, base_total_dispatch_component_bytes=base_value
)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics
) -> Tuple[int, Optional[int]]:
return (
compile_metrics.total_dispatch_component_bytes,
compile_metrics.base_total_dispatch_component_bytes,
)
def get_metric_id(self) -> str:
return TOTAL_DISPATCH_SIZE_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{TOTAL_DISPATCH_SIZE_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "bytes"
def get_table_header(self) -> str:
return f"Total Dispatch Size ({self.get_unit()})"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return TOTAL_DISPATCH_SIZE_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Total Dispatch Sizes"
class TotalArtifactSizeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to total artifact size column."""
def update_base_value(
self, compile_metrics: CompilationMetrics, base_value: Any
) -> CompilationMetrics:
return dataclasses.replace(
compile_metrics, base_total_artifact_bytes=base_value
)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics
) -> Tuple[int, Optional[int]]:
return (
compile_metrics.total_artifact_bytes,
compile_metrics.base_total_artifact_bytes,
)
def get_metric_id(self) -> str:
return TOTAL_ARTIFACT_SIZE_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{TOTAL_ARTIFACT_SIZE_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "bytes"
def get_table_header(self) -> str:
return f"Total Artifact Size ({self.get_unit()})"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return TOTAL_ARTIFACT_SIZE_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Total Artifact Sizes"
class StreamIRDispatchCountToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to Stream IR Dispatch Count column."""
def update_base_value(
self, compile_metrics: CompilationMetrics, base_value: Any
) -> CompilationMetrics:
return dataclasses.replace(
compile_metrics, base_stream_ir_dispatch_count=base_value
)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics
) -> Tuple[int, Optional[int]]:
return (
compile_metrics.stream_ir_dispatch_count,
compile_metrics.base_stream_ir_dispatch_count,
)
def get_metric_id(self) -> str:
return STREAM_IR_DISPATCH_COUNT_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{STREAM_IR_DISPATCH_COUNT_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "number"
def get_table_header(self) -> str:
return f"Stream IR Dispatch Count (# of cmd.dispatch ops)"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return benchmark_thresholds.STREAM_IR_DISPATCH_COUNT_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Stream IR Dispatch Count (# of cmd.dispatch ops)"
COMPILATION_METRICS_TO_TABLE_MAPPERS: List[MetricsToTableMapper[CompilationMetrics]] = [
CompilationTimeToTable(),
TotalDispatchSizeToTable(),
TotalArtifactSizeToTable(),
StreamIRDispatchCountToTable(),
]
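# A minimal usage sketch (assuming hypothetical dicts `compile_metrics` and
# `base_metrics`, both keyed by target id): the mappers fold base values into
# the current CompilationMetrics objects, which the table generators below then
# read back via get_current_and_base_value():
#
#   for target_id, metrics in compile_metrics.items():
#       for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS:
#           base, _ = mapper.get_current_and_base_value(base_metrics[target_id])
#           metrics = mapper.update_base_value(metrics, base)
#       compile_metrics[target_id] = metrics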
def aggregate_all_benchmarks(
benchmark_files: Sequence[pathlib.Path], expected_pr_commit: Optional[str] = None
) -> Dict[str, AggregateBenchmarkLatency]:
"""Aggregates all benchmarks in the given files.
Args:
    - benchmark_files: A list of JSON files, each of which can be decoded as a
      BenchmarkResults.
    - expected_pr_commit: An optional Git commit SHA to match against.
    Returns:
    - A dict mapping benchmark run config ids to AggregateBenchmarkLatency.
"""
aggregate_results = {}
benchmark_names = set()
for benchmark_file in benchmark_files:
file_results = benchmark_definition.BenchmarkResults.from_json_str(
benchmark_file.read_text()
)
if (expected_pr_commit is not None) and (
file_results.commit != expected_pr_commit
):
raise ValueError("Inconsistent pull request commit")
        for benchmark_run in file_results.benchmarks:
series_name = str(benchmark_run.info)
# Make sure each benchmark has a unique name.
if series_name in benchmark_names:
raise ValueError(f"Duplicated benchmark name: {series_name}")
benchmark_names.add(series_name)
series_id = benchmark_run.info.run_config_id
if series_id in aggregate_results:
raise ValueError(f"Duplicated benchmark id: {series_id}")
aggregate_results[series_id] = AggregateBenchmarkLatency(
name=series_name,
benchmark_info=benchmark_run.info,
mean_time=benchmark_run.metrics.real_time.mean,
median_time=benchmark_run.metrics.real_time.median,
stddev_time=benchmark_run.metrics.real_time.stddev,
)
return aggregate_results
def collect_all_compilation_metrics(
compile_stats_files: Sequence[pathlib.Path],
expected_pr_commit: Optional[str] = None,
) -> Dict[str, CompilationMetrics]:
"""Collects all compilation statistics in the given files.
Args:
      compile_stats_files: A list of JSON files, each of which can be decoded as a
        CompilationResults.
      expected_pr_commit: An optional Git commit SHA to match against.
    Returns:
      A dict mapping generation config ids to CompilationMetrics.
"""
compile_metrics = {}
target_names = set()
for compile_stats_file in compile_stats_files:
with compile_stats_file.open("r") as f:
file_results = benchmark_definition.CompilationResults.from_json_object(
json.load(f)
)
if (expected_pr_commit is not None) and (
file_results.commit != expected_pr_commit
):
raise ValueError("Inconsistent pull request commit")
for compile_stats in file_results.compilation_statistics:
component_sizes = compile_stats.module_component_sizes
stream_dispatch_count = compile_stats.ir_stats.stream_dispatch_count
target_name = str(compile_stats.compilation_info)
if target_name in target_names:
raise ValueError(f"Duplicated target name: {target_name}")
target_names.add(target_name)
target_id = compile_stats.compilation_info.gen_config_id
if target_id in compile_metrics:
raise ValueError(f"Duplicated target id: {target_id}")
compile_metrics[target_id] = CompilationMetrics(
name=target_name,
compilation_info=compile_stats.compilation_info,
compilation_time_ms=compile_stats.compilation_time_ms,
total_artifact_bytes=component_sizes.file_bytes,
total_dispatch_component_bytes=component_sizes.total_dispatch_component_bytes,
stream_ir_dispatch_count=stream_dispatch_count,
)
return compile_metrics
def make_series_link(name: str, series_id: str) -> str:
"""Add link to the given benchmark name.
Args:
name: the text to show on the link.
series_id: the dashboard series id.
"""
url = PERFBOARD_SERIES_PREFIX + urllib.parse.quote(series_id, safe="()[]@,")
return md.link(name, url)
def _add_header_and_get_markdown_table(
headers: Sequence[str], rows: Sequence[Tuple], size_cut: Optional[int] = None
) -> str:
"""Generates a markdown table with headers.
Args:
headers: list of table headers.
rows: list of rows. Each row is a tuple with the same length as headers.
size_cut: If not None, only show the top N results for each table.
"""
total_size = len(rows)
if size_cut is not None:
rows = rows[0:size_cut]
columns = [[header] for header in headers]
for row in rows:
for column, item in zip(columns, row):
column.append(item)
table_str = md.table(columns)
if size_cut is not None and size_cut < total_size:
table_str += "\n\n"
        table_str += md.italics(f"[Top {size_cut} out of {total_size} results shown]")
return table_str
def _categorize_on_single_metric(
metrics_map: Dict[str, T],
metric_func: GetMetricFunc,
thresholds: Sequence[BenchmarkThreshold],
metric_unit: str,
) -> Tuple[Dict[str, T], Dict[str, T], Dict[str, T], Dict[str, T]]:
"""Categorize the metrics object into regressed, improved, similar, and the
raw group (the group with no base to compare to).
Args:
metrics_map: map of (series_id, metrics object).
metric_func: the function returns current and base value of the metric.
thresholds: list of threshold settings to match for categorizing.
Returns:
A tuple of (regressed, improved, similar, raw) groups.
"""
regressed_map = {}
improved_map = {}
similar_map = {}
raw_map = {}
for series_id, metrics_obj in metrics_map.items():
current, base = metric_func(metrics_obj)
if base is None:
raw_map[series_id] = metrics_obj
continue
series_name = str(metrics_obj)
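        # Find the first threshold whose regex matches the series name.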
similar_threshold = None
for threshold in thresholds:
if threshold.regex.match(series_name):
similar_threshold = threshold
break
if similar_threshold is None:
raise ValueError(f"No matched threshold setting for: {series_name}")
if similar_threshold.unit == ThresholdUnit.PERCENTAGE:
ratio = abs(current - base) / base * 100
elif similar_threshold.unit.value == metric_unit:
ratio = abs(current - base)
else:
raise ValueError(
f"Mismatch between metric unit '{metric_unit}' and threshold unit '{similar_threshold.unit.value}'"
)
if ratio <= similar_threshold.threshold:
similar_map[series_id] = metrics_obj
elif current > base:
regressed_map[series_id] = metrics_obj
else:
improved_map[series_id] = metrics_obj
return (regressed_map, improved_map, similar_map, raw_map)
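# Formats a number with a fixed number of decimal digits, dropping the decimals
# entirely for integral values, e.g. 12.0 -> "12" and 12.3456 -> "12.346".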
def _get_fixed_point_str(value: Union[int, float], digits=3) -> str:
if isinstance(value, int) or value.is_integer():
return str(math.floor(value))
return f"{{:.{digits}f}}".format(value)
def _get_compare_text(current: float, base: Optional[int]) -> str:
"""Generates the text of comparison between current and base value. Returns
the current value if the base value is None.
"""
    # If base is None, there is nothing to compare against.
if base is None:
return f"{_get_fixed_point_str(current)}"
ratio = abs(current - base) / base
direction = "↑" if current > base else ("↓" if current < base else "")
return f"{_get_fixed_point_str(current)} (vs. {_get_fixed_point_str(base)}, {ratio:.2%}{direction})"
def _sort_benchmarks_and_get_table(
benchmarks: Dict[str, AggregateBenchmarkLatency], size_cut: Optional[int] = None
) -> str:
"""Sorts all benchmarks according to the improvement/regression ratio and
returns a markdown table for it.
Args:
      benchmarks: map of (series_id, benchmark object). Each benchmark must have
        a base_mean_time to compare against.
size_cut: If not None, only show the top N results for each table.
"""
sorted_rows = []
for series_id, benchmark in benchmarks.items():
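        # Latencies are aggregated in nanoseconds; convert to milliseconds for display.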
current = benchmark.mean_time / 1e6
base = benchmark.base_mean_time / 1e6
ratio = abs(current - base) / base
str_mean = _get_compare_text(current, base)
clickable_name = make_series_link(benchmark.name, series_id)
sorted_rows.append(
(
ratio,
(
clickable_name,
str_mean,
f"{_get_fixed_point_str(benchmark.median_time / 1e6)}",
f"{_get_fixed_point_str(benchmark.stddev_time / 1e6)}",
),
)
)
sorted_rows.sort(key=lambda row: row[0], reverse=True)
return _add_header_and_get_markdown_table(
headers=BENCHMARK_RESULTS_HEADERS,
rows=[row[1] for row in sorted_rows],
size_cut=size_cut,
)
def categorize_benchmarks_into_tables(
benchmarks: Dict[str, AggregateBenchmarkLatency], size_cut: Optional[int] = None
) -> str:
"""Splits benchmarks into regressed/improved/similar/raw categories and
returns their markdown tables.
    If size_cut is None, the output includes the regressed/improved/similar/raw
    categories; otherwise, it includes only the regressed/improved/raw categories.
    Args:
      benchmarks: A dictionary mapping series ids to their aggregate latency info.
size_cut: If not None, only show the top N results for each table.
"""
regressed, improved, similar, raw = _categorize_on_single_metric(
benchmarks,
lambda results: (results.mean_time, results.base_mean_time),
BENCHMARK_THRESHOLDS,
"ns",
)
tables = []
if regressed:
tables.append(md.header("Regressed Latencies 🚩", 3))
tables.append(_sort_benchmarks_and_get_table(regressed, size_cut))
if improved:
tables.append(md.header("Improved Latencies 🎉", 3))
tables.append(_sort_benchmarks_and_get_table(improved, size_cut))
# If we want to abbreviate, similar results won't be interesting.
if similar and size_cut is None:
tables.append(md.header("Similar Latencies", 3))
tables.append(_sort_benchmarks_and_get_table(similar, size_cut))
if raw:
tables.append(md.header("Raw Latencies", 3))
raw_list = [
(
make_series_link(name=v.name, series_id=k),
f"{_get_fixed_point_str(v.mean_time / 1e6)}",
f"{_get_fixed_point_str(v.median_time / 1e6)}",
f"{_get_fixed_point_str(v.stddev_time / 1e6)}",
)
for k, v in raw.items()
]
tables.append(
_add_header_and_get_markdown_table(
BENCHMARK_RESULTS_HEADERS, raw_list, size_cut=size_cut
)
)
return "\n\n".join(tables)
def _sort_metrics_objects_and_get_table(
metrics_objs: Dict[str, T],
mapper: MetricsToTableMapper[T],
headers: Sequence[str],
size_cut: Optional[int] = None,
) -> str:
"""Sorts all metrics objects according to the improvement/regression ratio and
returns a markdown table for it.
Args:
      metrics_objs: map of (target_id, CompilationMetrics). All objects must
        contain a base value.
mapper: MetricsToTableMapper for metrics_objs.
headers: list of table headers.
size_cut: If not None, only show the top N results for each table.
"""
sorted_rows = []
for target_id, metrics_obj in metrics_objs.items():
current, base = mapper.get_current_and_base_value(metrics_obj)
if base is None:
raise AssertionError("Base can't be None for sorting.")
ratio = abs(current - base) / base
sorted_rows.append(
(
ratio,
(
make_series_link(str(metrics_obj), mapper.get_series_id(target_id)),
_get_compare_text(current, base),
),
)
)
sorted_rows.sort(key=lambda row: row[0], reverse=True)
return _add_header_and_get_markdown_table(
headers=headers, rows=[row[1] for row in sorted_rows], size_cut=size_cut
)
def categorize_compilation_metrics_into_tables(
compile_metrics_map: Dict[str, CompilationMetrics], size_cut: Optional[int] = None
) -> str:
"""Splits compilation metrics into regressed/improved/all categories
and returns their markdown tables.
    If size_cut is None, the output includes the regressed/improved/all
    categories; otherwise, it includes only the regressed/improved categories.
    Args:
      compile_metrics_map: A dictionary mapping target ids to their compilation
        metrics.
size_cut: If not None, only show the top N results for each table.
"""
tables = []
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS:
regressed, improved, _, _ = _categorize_on_single_metric(
compile_metrics_map,
mapper.get_current_and_base_value,
mapper.get_metric_thresholds(),
mapper.get_unit(),
)
table_title = mapper.get_table_title()
table_header = mapper.get_table_header()
if regressed:
tables.append(md.header(f"Regressed {table_title} 🚩", 3))
tables.append(
_sort_metrics_objects_and_get_table(
metrics_objs=regressed,
mapper=mapper,
headers=["Benchmark Name", table_header],
size_cut=size_cut,
)
)
if improved:
tables.append(md.header(f"Improved {table_title} 🎉", 3))
tables.append(
_sort_metrics_objects_and_get_table(
metrics_objs=improved,
mapper=mapper,
headers=["Benchmark Name", table_header],
size_cut=size_cut,
)
)
# If we want to abbreviate, similar results won't be interesting.
if size_cut is None and compile_metrics_map:
tables.append(md.header("All Compilation Metrics", 3))
headers = ["Benchmark Name"] + [
mapper.get_table_header() for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS
]
rows = []
for target_id, metrics in compile_metrics_map.items():
row = [metrics.name]
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS:
current, base = mapper.get_current_and_base_value(metrics)
row.append(
make_series_link(
_get_compare_text(current, base),
mapper.get_series_id(target_id),
)
)
rows.append(tuple(row))
tables.append(
_add_header_and_get_markdown_table(headers, rows, size_cut=size_cut)
)
return "\n\n".join(tables)