# Copyright 2021 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from abc import ABC, abstractmethod
import json
import urllib.parse
import markdown_strings as md
from dataclasses import dataclass
from typing import Any, Callable, Dict, Generic, List, Optional, Sequence, Tuple, TypeVar
from common.benchmark_definition import BenchmarkResults, CompilationInfo, CompilationResults
from common.benchmark_thresholds import BENCHMARK_THRESHOLDS, COMPILATION_TIME_THRESHOLDS, TOTAL_DISPATCH_SIZE_THRESHOLDS, BenchmarkThreshold, ThresholdUnit
GetMetricFunc = Callable[[Any], Tuple[int, Optional[int]]]
GetTableRowFunc = Callable[[str, Any], Tuple]
PERFBOARD_SERIES_PREFIX = "https://perf.iree.dev/serie?IREE?"
BENCHMARK_RESULTS_HEADERS = [
"Benchmark Name",
"Average Latency (ms)",
"Median Latency (ms)",
"Latency Standard Deviation (ms)",
]
COMPILATION_TIME_SERIES_SUFFIX = "compilation:module:compilation-time"
TOTAL_DISPATCH_SIZE_SERIES_SUFFIX = "compilation:module:component-size:total-dispatch-size"
@dataclass
class AggregateBenchmarkLatency:
"""An object for describing aggregate latency numbers for a benchmark."""
mean_time: int
median_time: int
stddev_time: int
# The average latency time for the base commit to compare against.
base_mean_time: Optional[int] = None
@dataclass
class CompilationMetrics:
"""An object for describing the summary of statistics and the reference."""
compilation_info: CompilationInfo
compilation_time: int
total_dispatch_component_size: int
base_compilation_time: Optional[int] = None
base_total_dispatch_component_size: Optional[int] = None
T = TypeVar("T")
class MetricsToTableMapper(ABC, Generic[T]):
"""Abstract class to help map benchmark metrics to table.
It contains a set of methods to help table generator get the required
information for a metric. For example, extract the current and base metric
value, the metric thresholds, the table header of the metrics, ...
"""
@abstractmethod
def get_current_and_base_value(self, obj: T) -> Tuple[int, Optional[int]]:
"""Returns the current and base (can be None) value."""
raise NotImplementedError()
@abstractmethod
def get_series_name(self, name: str) -> str:
"""Returns the dashboard series name."""
raise NotImplementedError()
  @staticmethod
  @abstractmethod
  def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
    """Returns the threshold settings used to categorize the metric."""
    raise NotImplementedError()
  @staticmethod
  @abstractmethod
  def get_table_title() -> str:
    """Returns the title of the metric table."""
    raise NotImplementedError()
  @staticmethod
  @abstractmethod
  def get_table_header() -> str:
    """Returns the header of the metric value column."""
    raise NotImplementedError()
class CompilationTimeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to compilation time column."""
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics) -> Tuple[int, Optional[int]]:
return (compile_metrics.compilation_time,
compile_metrics.base_compilation_time)
def get_series_name(self, name: str) -> str:
return f"{name} [{COMPILATION_TIME_SERIES_SUFFIX}]"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return COMPILATION_TIME_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Compilation Times"
@staticmethod
def get_table_header() -> str:
return "Compilation Time (ms)"
class TotalDispatchSizeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to total dispatch size column."""
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics) -> Tuple[int, Optional[int]]:
return (compile_metrics.total_dispatch_component_size,
compile_metrics.base_total_dispatch_component_size)
def get_series_name(self, name: str) -> str:
return f"{name} [{TOTAL_DISPATCH_SIZE_SERIES_SUFFIX}]"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return TOTAL_DISPATCH_SIZE_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Total Dispatch Sizes"
@staticmethod
def get_table_header() -> str:
return "Total Dispatch Size (bytes)"
COMPILATION_METRICS_TO_TABLE_MAPPERS: List[
MetricsToTableMapper[CompilationMetrics]] = [
CompilationTimeToTable(),
TotalDispatchSizeToTable(),
]
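
# Illustrative note (not part of the original module): a new compilation
# metric column can be supported by adding another
# MetricsToTableMapper[CompilationMetrics] subclass (following
# CompilationTimeToTable above) and registering an instance, e.g. with a
# hypothetical StreamIRSizeToTable:
#
#   COMPILATION_METRICS_TO_TABLE_MAPPERS.append(StreamIRSizeToTable())
#
# The table generators below iterate over every registered mapper.
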
def aggregate_all_benchmarks(
benchmark_files: Sequence[str],
expected_pr_commit: Optional[str] = None,
verbose: bool = False) -> Dict[str, AggregateBenchmarkLatency]:
"""Aggregates all benchmarks in the given files.
Args:
- benchmark_files: A list of JSON files, each can be decoded as a
BenchmarkResults.
- expected_pr_commit: An optional Git commit SHA to match against.
Returns:
- A dict of benchmark names to AggregateBenchmarkLatency numbers.
"""
aggregate_results = {}
for benchmark_file in benchmark_files:
with open(benchmark_file) as f:
content = f.read()
file_results = BenchmarkResults.from_json_str(content)
if ((expected_pr_commit is not None) and
(file_results.commit != expected_pr_commit)):
raise ValueError("Inconsistent pull request commit")
for benchmark_index in range(len(file_results.benchmarks)):
benchmark_case = file_results.benchmarks[benchmark_index]
# Make sure each benchmark has a unique name.
name = str(benchmark_case.benchmark_info)
if name in aggregate_results:
raise ValueError(f"Duplicated benchmarks: {name}")
# Now scan all benchmark iterations and find the aggregate results.
mean_time = file_results.get_aggregate_time(benchmark_index, "mean")
median_time = file_results.get_aggregate_time(benchmark_index, "median")
stddev_time = file_results.get_aggregate_time(benchmark_index, "stddev")
aggregate_results[name] = AggregateBenchmarkLatency(
mean_time, median_time, stddev_time)
return aggregate_results
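
# Example (illustrative only; the file names and commit SHA below are
# hypothetical):
#
#   latencies = aggregate_all_benchmarks(
#       benchmark_files=["pixel-4-benchmark-results.json",
#                        "pixel-6-benchmark-results.json"],
#       expected_pr_commit="abc123")
#
# The result maps each unique benchmark name to an AggregateBenchmarkLatency
# whose base_mean_time defaults to None.
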
def collect_all_compilation_metrics(
compile_stats_files: Sequence[str],
expected_pr_commit: Optional[str] = None) -> Dict[str, CompilationMetrics]:
"""Collects all compilation statistics in the given files.
Args:
compile_stats_files: A list of JSON files, each can be decoded as a
CompilationResults.
expected_pr_commit: An optional Git commit SHA to match against.
Returns:
A dict of benchmark names to CompilationMetrics.
"""
compile_metrics = {}
for compile_stats_file in compile_stats_files:
with open(compile_stats_file) as f:
file_results = CompilationResults.from_json_object(json.load(f))
if ((expected_pr_commit is not None) and
(file_results.commit != expected_pr_commit)):
raise ValueError("Inconsistent pull request commit")
for compile_stats in file_results.compilation_statistics:
component_sizes = compile_stats.module_component_sizes
name = str(compile_stats.compilation_info)
compile_metrics[name] = CompilationMetrics(
compilation_info=compile_stats.compilation_info,
compilation_time=compile_stats.compilation_time,
total_dispatch_component_size=component_sizes.
total_dispatch_component_size)
return compile_metrics
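
# Example (illustrative only; the file name and commit SHA are hypothetical):
#
#   compile_metrics = collect_all_compilation_metrics(
#       compile_stats_files=["compile-stats.json"],
#       expected_pr_commit="abc123")
#
# The result maps each benchmark name to a CompilationMetrics whose base_*
# fields default to None.
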
def _make_series_link(name: str, series: Optional[str] = None) -> str:
"""Add link to the given benchmark name.
Args:
name: the text to show on the link.
series: the dashboard series name. Use name if None.
"""
if series is None:
series = name
url = PERFBOARD_SERIES_PREFIX + urllib.parse.quote(series, safe="()[]@,")
return md.link(name, url)
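
# Example (illustrative; the series name is hypothetical). This returns a
# markdown link roughly like:
#
#   _make_series_link("MobileNetV2 [fp32] @ Pixel-4 (CPU)")
#   # -> [MobileNetV2 [fp32] @ Pixel-4 (CPU)](https://perf.iree.dev/serie?IREE?MobileNetV2%20[fp32]%20@%20Pixel-4%20(CPU))
#
# Spaces are percent-encoded; "()[]@," are left as-is because they are passed
# to urllib.parse.quote as safe characters.
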
def _add_header_and_get_markdown_table(headers: Sequence[str],
rows: Sequence[Tuple],
size_cut: Optional[int] = None) -> str:
"""Generates a markdown table with headers.
Args:
headers: list of table headers.
rows: list of rows. Each row is a tuple with the same length as headers.
size_cut: If not None, only show the top N results for each table.
"""
total_size = len(rows)
if size_cut is not None:
rows = rows[0:size_cut]
columns = [[header] for header in headers]
for row in rows:
for column, item in zip(columns, row):
column.append(item)
table_str = md.table(columns)
if size_cut is not None and size_cut < total_size:
table_str += "\n\n"
    table_str += md.italics(
        f"[Top {size_cut} out of {total_size} results shown]")
return table_str
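
# Example (illustrative):
#
#   _add_header_and_get_markdown_table(
#       headers=["Benchmark Name", "Average Latency (ms)"],
#       rows=[("model-a", 12), ("model-b", 34), ("model-c", 56)],
#       size_cut=2)
#
# This renders a markdown table with the first two rows followed by an
# italicized "[Top 2 out of 3 results shown]" note.
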
T = TypeVar("T")
def _categorize_on_single_metric(
metrics_map: Dict[str, T],
metric_func: GetMetricFunc,
thresholds: Sequence[BenchmarkThreshold],
) -> Tuple[Dict[str, T], Dict[str, T], Dict[str, T], Dict[str, T]]:
"""Categorize the metrics object into regressed, improved, similar, and the
raw group (the group with no base to compare to).
Args:
metrics_map: map of (name, metrics object).
metric_func: the function returns current and base value of the metric.
thresholds: list of threshold settings to match for categorizing.
Returns:
A tuple of (regressed, improved, similar, raw) groups.
"""
regressed_map = {}
improved_map = {}
similar_map = {}
raw_map = {}
for name, metrics_obj in metrics_map.items():
current, base = metric_func(metrics_obj)
if base is None:
raw_map[name] = metrics_obj
continue
similar_threshold = None
for threshold in thresholds:
if threshold.regex.match(name):
similar_threshold = threshold
break
if similar_threshold is None:
raise ValueError(f"No matched threshold setting for: {name}")
if similar_threshold.unit == ThresholdUnit.PERCENTAGE:
ratio = abs(current - base) / base * 100
else:
ratio = abs(current - base)
if ratio <= similar_threshold.threshold:
similar_map[name] = metrics_obj
elif current > base:
regressed_map[name] = metrics_obj
else:
improved_map[name] = metrics_obj
return (regressed_map, improved_map, similar_map, raw_map)
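
# Example (illustrative, reusing the hypothetical `latencies` dict from the
# aggregate_all_benchmarks example above):
#
#   regressed, improved, similar, raw = _categorize_on_single_metric(
#       metrics_map=latencies,
#       metric_func=lambda b: (b.mean_time, b.base_mean_time),
#       thresholds=BENCHMARK_THRESHOLDS)
#
# Benchmarks without a base value end up in `raw`; the rest are categorized by
# comparing the change (percentage or absolute, per threshold unit) against
# the first threshold whose regex matches the benchmark name.
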
def _get_compare_text(current: int, base: Optional[int]) -> str:
"""Generates the text of comparison between current and base value. Returns
the current value if the base value is None.
"""
# If base is None, don't need to do compare.
if base is None:
return f"{current}"
ratio = abs(current - base) / base
direction = "↑" if current > base else ("↓" if current < base else "")
return f"{current} (vs. {base}, {ratio:.2%}{direction})"
def _sort_benchmarks_and_get_table(benchmarks: Dict[str,
AggregateBenchmarkLatency],
size_cut: Optional[int] = None) -> str:
"""Sorts all benchmarks according to the improvement/regression ratio and
returns a markdown table for it.
Args:
benchmarks_map: map of (name, benchmark object).
size_cut: If not None, only show the top N results for each table.
"""
sorted_rows = []
for name, benchmark in benchmarks.items():
current = benchmark.mean_time
base = benchmark.base_mean_time
ratio = abs(current - base) / base
str_mean = _get_compare_text(current, base)
clickable_name = _make_series_link(name)
sorted_rows.append((ratio, (clickable_name, str_mean, benchmark.median_time,
benchmark.stddev_time)))
sorted_rows.sort(key=lambda row: row[0], reverse=True)
return _add_header_and_get_markdown_table(
headers=BENCHMARK_RESULTS_HEADERS,
rows=[row[1] for row in sorted_rows],
size_cut=size_cut)
def categorize_benchmarks_into_tables(benchmarks: Dict[
str, AggregateBenchmarkLatency],
size_cut: Optional[int] = None) -> str:
"""Splits benchmarks into regressed/improved/similar/raw categories and
returns their markdown tables.
If size_cut is None, the table includes regressed/improved/similar/raw
categories; otherwise, the table includes regressed/improved/raw categories.
Args:
benchmarks: A dictionary of benchmark names to its aggregate info.
size_cut: If not None, only show the top N results for each table.
"""
regressed, improved, similar, raw = _categorize_on_single_metric(
benchmarks, lambda results: (results.mean_time, results.base_mean_time),
BENCHMARK_THRESHOLDS)
tables = []
if regressed:
tables.append(md.header("Regressed Latencies 🚩", 3))
tables.append(_sort_benchmarks_and_get_table(regressed, size_cut))
if improved:
tables.append(md.header("Improved Latencies 🎉", 3))
tables.append(_sort_benchmarks_and_get_table(improved, size_cut))
# If we want to abbreviate, similar results won't be interesting.
if similar and size_cut is None:
tables.append(md.header("Similar Latencies", 3))
tables.append(_sort_benchmarks_and_get_table(similar, size_cut))
if raw:
tables.append(md.header("Raw Latencies", 3))
raw_list = [(_make_series_link(k), v.mean_time, v.median_time,
v.stddev_time) for k, v in raw.items()]
tables.append(
_add_header_and_get_markdown_table(BENCHMARK_RESULTS_HEADERS,
raw_list,
size_cut=size_cut))
return "\n\n".join(tables)
def _sort_metrics_objects_and_get_table(metrics_objs: Dict[str, T],
mapper: MetricsToTableMapper[T],
headers: Sequence[str],
size_cut: Optional[int] = None) -> str:
"""Sorts all metrics objects according to the improvement/regression ratio and
returns a markdown table for it.
Args:
metrics_objs: map of (name, metrics object). All objects must contain base
value.
mapper: MetricsToTableMapper for metrics_objs.
headers: list of table headers.
size_cut: If not None, only show the top N results for each table.
"""
sorted_rows = []
for name, metrics_obj in metrics_objs.items():
current, base = mapper.get_current_and_base_value(metrics_obj)
if base is None:
raise AssertionError("Base can't be None for sorting.")
ratio = abs(current - base) / base
sorted_rows.append((ratio, (
_make_series_link(name, mapper.get_series_name(name)),
_get_compare_text(current, base),
)))
sorted_rows.sort(key=lambda row: row[0], reverse=True)
return _add_header_and_get_markdown_table(
headers=headers, rows=[row[1] for row in sorted_rows], size_cut=size_cut)
def categorize_compilation_metrics_into_tables(
compile_metrics_map: Dict[str, CompilationMetrics],
size_cut: Optional[int] = None) -> str:
"""Splits compilation metrics into regressed/improved/all categories
and returns their markdown tables.
If size_cut is None, the table includes regressed/improved/all categories;
otherwise, the table includes regressed/improved categories.
Args:
compile_metrics_map: A dictionary of benchmark names to its compilation
metrics.
size_cut: If not None, only show the top N results for each table.
"""
tables = []
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS:
regressed, improved, _, _ = _categorize_on_single_metric(
compile_metrics_map, mapper.get_current_and_base_value,
mapper.get_metric_thresholds())
table_title = mapper.get_table_title()
table_header = mapper.get_table_header()
if regressed:
tables.append(md.header(f"Regressed {table_title} 🚩", 3))
tables.append(
_sort_metrics_objects_and_get_table(regressed, mapper,
["Benchmark Name", table_header],
size_cut))
if improved:
tables.append(md.header(f"Improved {table_title} 🎉", 3))
tables.append(
_sort_metrics_objects_and_get_table(improved, mapper,
["Benchmark Name", table_header],
size_cut))
# If we want to abbreviate, similar results won't be interesting.
if size_cut is None and compile_metrics_map:
tables.append(md.header("All Compilation Metrics", 3))
headers = ["Benchmark Name"] + [
mapper.get_table_header()
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS
]
rows = []
for name, metrics in compile_metrics_map.items():
row = [name]
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS:
current, base = mapper.get_current_and_base_value(metrics)
row.append(
_make_series_link(_get_compare_text(current, base),
mapper.get_series_name(name)))
rows.append(tuple(row))
tables.append(
_add_header_and_get_markdown_table(headers, rows, size_cut=size_cut))
return "\n\n".join(tables)