# Copyright 2021 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import (Any, Callable, Dict, Generic, List, Optional, Sequence,
Tuple, TypeVar, Union)
import pathlib
import dataclasses
import json
import urllib.parse
import markdown_strings as md
import math
from common import benchmark_definition, benchmark_thresholds
from common.benchmark_thresholds import (BENCHMARK_THRESHOLDS,
COMPILATION_TIME_THRESHOLDS,
TOTAL_ARTIFACT_SIZE_THRESHOLDS,
TOTAL_DISPATCH_SIZE_THRESHOLDS,
BenchmarkThreshold, ThresholdUnit)
GetMetricFunc = Callable[[Any], Tuple[int, Optional[int]]]
PERFBOARD_SERIES_PREFIX = "https://perf.iree.dev/serie?IREE?"
BENCHMARK_RESULTS_HEADERS = [
"Benchmark Name",
"Average Latency (ms)",
"Median Latency (ms)",
"Latency Standard Deviation (ms)",
]
# Since we don't have a structured way to store metric data yet, each metric is
# assigned a fixed id generated from uuid.uuid4() to identify its series.
COMPILATION_TIME_METRIC_ID = "e54cd682-c079-4c42-b4ad-d92c4bedea13"
COMPILATION_TIME_SERIES_SUFFIX = "compilation:module:compilation-time"
TOTAL_DISPATCH_SIZE_METRIC_ID = "9e15f7e6-383c-47ec-bd38-ecba55a5f10a"
TOTAL_DISPATCH_SIZE_SERIES_SUFFIX = "compilation:module:component-size:total-dispatch-size"
TOTAL_ARTIFACT_SIZE_METRIC_ID = "2c8a9198-c01c-45b9-a7da-69c82cf749f7"
TOTAL_ARTIFACT_SIZE_SERIES_SUFFIX = "compilation:module:total-artifact-size"
STREAM_IR_DISPATCH_COUNT_METRIC_ID = "7b72cd9e-43ed-4078-b6d3-20b810f9e4ad"
STREAM_IR_DISPATCH_COUNT_SERIES_SUFFIX = "compilation:ir:stream-dispatch-count"
@dataclass
class AggregateBenchmarkLatency:
"""An object for describing aggregate latency numbers for a benchmark."""
name: str
benchmark_info: benchmark_definition.BenchmarkInfo
mean_time: int
median_time: int
stddev_time: int
# The average latency time for the base commit to compare against.
base_mean_time: Optional[int] = None
def __str__(self) -> str:
return self.name
@dataclass(frozen=True)
class CompilationMetrics:
"""An object for describing the summary of statistics and the reference."""
name: str
compilation_info: benchmark_definition.CompilationInfo
compilation_time_ms: int
total_dispatch_component_bytes: int
total_artifact_bytes: int
stream_ir_dispatch_count: int
base_compilation_time_ms: Optional[int] = None
base_total_artifact_bytes: Optional[int] = None
base_total_dispatch_component_bytes: Optional[int] = None
base_stream_ir_dispatch_count: Optional[int] = None
def __str__(self) -> str:
return self.name
T = TypeVar("T")
class MetricsToTableMapper(ABC, Generic[T]):
"""Abstract class to help map benchmark metrics to table.
It contains a set of methods to help table generator get the required
information for a metric. For example, extract the current and base metric
value, the metric thresholds, the table header of the metrics, ...
"""
@abstractmethod
def update_base_value(self, obj: T, base_value: Any) -> T:
"""Sets the base value and returns the updated metric object."""
raise NotImplementedError()
@abstractmethod
def get_current_and_base_value(self, obj: T) -> Tuple[int, Optional[int]]:
"""Returns the current and base (can be None) value."""
raise NotImplementedError()
def get_series_id(self, benchmark_id: str) -> str:
"""Returns the dashboard series id."""
return f"{benchmark_id}-{self.get_metric_id()}"
@abstractmethod
def get_metric_id(self) -> str:
"""Returns the dashboard series id."""
raise NotImplementedError()
@abstractmethod
def get_series_name(self, name: str) -> str:
"""Returns the dashboard series name."""
raise NotImplementedError()
@abstractmethod
def get_unit(self) -> str:
"""Returns the unit of the metric value."""
raise NotImplementedError()
@abstractmethod
def get_table_header(self) -> str:
"""Returns the header of the table."""
raise NotImplementedError()
@staticmethod
@abstractmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
raise NotImplementedError()
@staticmethod
@abstractmethod
def get_table_title() -> str:
raise NotImplementedError()
class CompilationTimeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to compilation time column."""
def update_base_value(self, compile_metrics: CompilationMetrics,
base_value: Any) -> CompilationMetrics:
return dataclasses.replace(compile_metrics,
base_compilation_time_ms=base_value)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics) -> Tuple[int, Optional[int]]:
return (compile_metrics.compilation_time_ms,
compile_metrics.base_compilation_time_ms)
def get_metric_id(self) -> str:
return COMPILATION_TIME_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{COMPILATION_TIME_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "ms"
def get_table_header(self) -> str:
return f"Compilation Time ({self.get_unit()})"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return COMPILATION_TIME_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Compilation Times"
class TotalDispatchSizeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to total dispatch size column."""
def update_base_value(self, compile_metrics: CompilationMetrics,
base_value: Any) -> CompilationMetrics:
return dataclasses.replace(compile_metrics,
base_total_dispatch_component_bytes=base_value)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics) -> Tuple[int, Optional[int]]:
return (compile_metrics.total_dispatch_component_bytes,
compile_metrics.base_total_dispatch_component_bytes)
def get_metric_id(self) -> str:
return TOTAL_DISPATCH_SIZE_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{TOTAL_DISPATCH_SIZE_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "bytes"
def get_table_header(self) -> str:
return f"Total Dispatch Size ({self.get_unit()})"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return TOTAL_DISPATCH_SIZE_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Total Dispatch Sizes"
class TotalArtifactSizeToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to total artifact size column."""
def update_base_value(self, compile_metrics: CompilationMetrics,
base_value: Any) -> CompilationMetrics:
return dataclasses.replace(compile_metrics,
base_total_artifact_bytes=base_value)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics) -> Tuple[int, Optional[int]]:
return (compile_metrics.total_artifact_bytes,
compile_metrics.base_total_artifact_bytes)
def get_metric_id(self) -> str:
return TOTAL_ARTIFACT_SIZE_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{TOTAL_ARTIFACT_SIZE_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "bytes"
def get_table_header(self) -> str:
return f"Total Artifact Size ({self.get_unit()})"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return TOTAL_ARTIFACT_SIZE_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Total Artifact Sizes"
class StreamIRDispatchCountToTable(MetricsToTableMapper[CompilationMetrics]):
"""Helper to map CompilationMetrics to Stream IR Dispatch Count column."""
def update_base_value(self, compile_metrics: CompilationMetrics,
base_value: Any) -> CompilationMetrics:
return dataclasses.replace(compile_metrics,
base_stream_ir_dispatch_count=base_value)
def get_current_and_base_value(
self, compile_metrics: CompilationMetrics) -> Tuple[int, Optional[int]]:
return (compile_metrics.stream_ir_dispatch_count,
compile_metrics.base_stream_ir_dispatch_count)
def get_metric_id(self) -> str:
return STREAM_IR_DISPATCH_COUNT_METRIC_ID
def get_series_name(self, name: str) -> str:
return f"{name} [{STREAM_IR_DISPATCH_COUNT_SERIES_SUFFIX}]"
def get_unit(self) -> str:
return "number"
def get_table_header(self) -> str:
return f"Stream IR Dispatch Count (# of cmd.dispatch ops)"
@staticmethod
def get_metric_thresholds() -> Sequence[BenchmarkThreshold]:
return benchmark_thresholds.STREAM_IR_DISPATCH_COUNT_THRESHOLDS
@staticmethod
def get_table_title() -> str:
return "Stream IR Dispatch Count (# of cmd.dispatch ops)"
COMPILATION_METRICS_TO_TABLE_MAPPERS: List[
MetricsToTableMapper[CompilationMetrics]] = [
CompilationTimeToTable(),
TotalDispatchSizeToTable(),
TotalArtifactSizeToTable(),
StreamIRDispatchCountToTable(),
]
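# For illustration, following the constants and classes above: with
# CompilationTimeToTable(), get_series_id("some-target-id") yields
# "some-target-id-e54cd682-c079-4c42-b4ad-d92c4bedea13" and get_table_header()
# yields "Compilation Time (ms)". ("some-target-id" is a made-up id here.)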
def aggregate_all_benchmarks(
benchmark_files: Sequence[pathlib.Path],
expected_pr_commit: Optional[str] = None
) -> Dict[str, AggregateBenchmarkLatency]:
"""Aggregates all benchmarks in the given files.
Args:
- benchmark_files: A list of JSON files, each can be decoded as a
BenchmarkResults.
- expected_pr_commit: An optional Git commit SHA to match against.
Returns:
- A dict of benchmark names to AggregateBenchmarkLatency numbers.
"""
aggregate_results = {}
benchmark_names = set()
for benchmark_file in benchmark_files:
file_results = benchmark_definition.BenchmarkResults.from_json_str(
benchmark_file.read_text())
if ((expected_pr_commit is not None) and
(file_results.commit != expected_pr_commit)):
raise ValueError("Inconsistent pull request commit")
    for benchmark_run in file_results.benchmarks:
series_name = str(benchmark_run.info)
# Make sure each benchmark has a unique name.
if series_name in benchmark_names:
raise ValueError(f"Duplicated benchmark name: {series_name}")
benchmark_names.add(series_name)
series_id = benchmark_run.info.run_config_id
if series_id in aggregate_results:
raise ValueError(f"Duplicated benchmark id: {series_id}")
aggregate_results[series_id] = AggregateBenchmarkLatency(
name=series_name,
benchmark_info=benchmark_run.info,
mean_time=benchmark_run.metrics.real_time.mean,
median_time=benchmark_run.metrics.real_time.median,
stddev_time=benchmark_run.metrics.real_time.stddev)
return aggregate_results
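# A minimal usage sketch (the file name and commit below are hypothetical, and
# the JSON files are assumed to follow the BenchmarkResults schema):
#
#   latencies = aggregate_all_benchmarks(
#       benchmark_files=[pathlib.Path("benchmark-results.json")],
#       expected_pr_commit="abc123")
#   # Each value is an AggregateBenchmarkLatency keyed by its run config id.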
def collect_all_compilation_metrics(
compile_stats_files: Sequence[pathlib.Path],
expected_pr_commit: Optional[str] = None) -> Dict[str, CompilationMetrics]:
"""Collects all compilation statistics in the given files.
Args:
compile_stats_files: A list of JSON files, each can be decoded as a
CompilationResults.
expected_pr_commit: An optional Git commit SHA to match against.
Returns:
A dict of benchmark names to CompilationMetrics.
"""
compile_metrics = {}
target_names = set()
for compile_stats_file in compile_stats_files:
with compile_stats_file.open("r") as f:
file_results = benchmark_definition.CompilationResults.from_json_object(
json.load(f))
if ((expected_pr_commit is not None) and
(file_results.commit != expected_pr_commit)):
raise ValueError("Inconsistent pull request commit")
for compile_stats in file_results.compilation_statistics:
component_sizes = compile_stats.module_component_sizes
stream_dispatch_count = compile_stats.ir_stats.stream_dispatch_count
target_name = str(compile_stats.compilation_info)
if target_name in target_names:
raise ValueError(f"Duplicated target name: {target_name}")
target_names.add(target_name)
target_id = compile_stats.compilation_info.gen_config_id
if target_id in compile_metrics:
raise ValueError(f"Duplicated target id: {target_id}")
compile_metrics[target_id] = CompilationMetrics(
name=target_name,
compilation_info=compile_stats.compilation_info,
compilation_time_ms=compile_stats.compilation_time_ms,
total_artifact_bytes=component_sizes.file_bytes,
          total_dispatch_component_bytes=(
              component_sizes.total_dispatch_component_bytes),
stream_ir_dispatch_count=stream_dispatch_count)
return compile_metrics
def _make_series_link(name: str, series_id: str) -> str:
"""Add link to the given benchmark name.
Args:
name: the text to show on the link.
series_id: the dashboard series id.
"""
url = PERFBOARD_SERIES_PREFIX + urllib.parse.quote(series_id, safe="()[]@,")
return md.link(name, url)
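# Example of the resulting markdown (assuming md.link renders "[name](url)"):
# _make_series_link("model (cpu)", "abc 123") gives
# "[model (cpu)](https://perf.iree.dev/serie?IREE?abc%20123)"; spaces are
# percent-encoded while "()[]@," are left untouched.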
def _add_header_and_get_markdown_table(headers: Sequence[str],
rows: Sequence[Tuple],
size_cut: Optional[int] = None) -> str:
"""Generates a markdown table with headers.
Args:
headers: list of table headers.
rows: list of rows. Each row is a tuple with the same length as headers.
size_cut: If not None, only show the top N results for each table.
"""
total_size = len(rows)
if size_cut is not None:
rows = rows[0:size_cut]
columns = [[header] for header in headers]
for row in rows:
for column, item in zip(columns, row):
column.append(item)
table_str = md.table(columns)
if size_cut is not None and size_cut < total_size:
table_str += "\n\n"
    table_str += md.italics(
        f"[Top {size_cut} out of {total_size} results shown]")
return table_str
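# A small illustration with made-up values:
#   _add_header_and_get_markdown_table(
#       headers=["Benchmark Name", "Average Latency (ms)"],
#       rows=[("model-a", "1.2"), ("model-b", "3.4")],
#       size_cut=1)
# renders a two-column markdown table containing only "model-a", followed by
# the italicized note "[Top 1 out of 2 results shown]".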
T = TypeVar("T")
def _categorize_on_single_metric(
metrics_map: Dict[str, T],
metric_func: GetMetricFunc,
thresholds: Sequence[BenchmarkThreshold],
metric_unit: str,
) -> Tuple[Dict[str, T], Dict[str, T], Dict[str, T], Dict[str, T]]:
"""Categorize the metrics object into regressed, improved, similar, and the
raw group (the group with no base to compare to).
Args:
metrics_map: map of (series_id, metrics object).
metric_func: the function returns current and base value of the metric.
thresholds: list of threshold settings to match for categorizing.
Returns:
A tuple of (regressed, improved, similar, raw) groups.
"""
regressed_map = {}
improved_map = {}
similar_map = {}
raw_map = {}
for series_id, metrics_obj in metrics_map.items():
current, base = metric_func(metrics_obj)
if base is None:
raw_map[series_id] = metrics_obj
continue
series_name = str(metrics_obj)
similar_threshold = None
for threshold in thresholds:
if threshold.regex.match(series_name):
similar_threshold = threshold
break
if similar_threshold is None:
raise ValueError(f"No matched threshold setting for: {series_name}")
if similar_threshold.unit == ThresholdUnit.PERCENTAGE:
ratio = abs(current - base) / base * 100
elif similar_threshold.unit.value == metric_unit:
ratio = abs(current - base)
else:
raise ValueError(
f"Mismatch between metric unit '{metric_unit}' and threshold unit '{similar_threshold.unit.value}'"
)
if ratio <= similar_threshold.threshold:
similar_map[series_id] = metrics_obj
elif current > base:
regressed_map[series_id] = metrics_obj
else:
improved_map[series_id] = metrics_obj
return (regressed_map, improved_map, similar_map, raw_map)
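# For example, with a hypothetical PERCENTAGE threshold of 5: a metric moving
# from a base of 100 to a current value of 103 gives
# abs(103 - 100) / 100 * 100 = 3 <= 5, so it is grouped as "similar"; moving to
# 110 gives 10 > 5 with current > base, so it is grouped as "regressed".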
def _get_fixed_point_str(value: Union[int, float], digits=3) -> str:
if isinstance(value, int) or value.is_integer():
return str(math.floor(value))
return f"{{:.{digits}f}}".format(value)
def _get_compare_text(current: float, base: Optional[int]) -> str:
"""Generates the text of comparison between current and base value. Returns
the current value if the base value is None.
"""
# If base is None, don't need to do compare.
if base is None:
return f"{_get_fixed_point_str(current)}"
ratio = abs(current - base) / base
direction = "↑" if current > base else ("↓" if current < base else "")
return f"{_get_fixed_point_str(current)} (vs. {_get_fixed_point_str(base)}, {ratio:.2%}{direction})"
def _sort_benchmarks_and_get_table(benchmarks: Dict[str,
AggregateBenchmarkLatency],
size_cut: Optional[int] = None) -> str:
"""Sorts all benchmarks according to the improvement/regression ratio and
returns a markdown table for it.
Args:
benchmarks_map: map of (series_id, benchmark object).
size_cut: If not None, only show the top N results for each table.
"""
sorted_rows = []
for series_id, benchmark in benchmarks.items():
current = benchmark.mean_time / 1e6
base = benchmark.base_mean_time / 1e6
ratio = abs(current - base) / base
str_mean = _get_compare_text(current, base)
clickable_name = _make_series_link(benchmark.name, series_id)
sorted_rows.append(
(ratio, (clickable_name, str_mean,
f"{_get_fixed_point_str(benchmark.median_time / 1e6)}",
f"{_get_fixed_point_str(benchmark.stddev_time / 1e6)}")))
sorted_rows.sort(key=lambda row: row[0], reverse=True)
return _add_header_and_get_markdown_table(
headers=BENCHMARK_RESULTS_HEADERS,
rows=[row[1] for row in sorted_rows],
size_cut=size_cut)
def categorize_benchmarks_into_tables(benchmarks: Dict[
str, AggregateBenchmarkLatency],
size_cut: Optional[int] = None) -> str:
"""Splits benchmarks into regressed/improved/similar/raw categories and
returns their markdown tables.
If size_cut is None, the table includes regressed/improved/similar/raw
categories; otherwise, the table includes regressed/improved/raw categories.
Args:
benchmarks: A dictionary of benchmark names to its aggregate info.
size_cut: If not None, only show the top N results for each table.
"""
regressed, improved, similar, raw = _categorize_on_single_metric(
benchmarks, lambda results: (results.mean_time, results.base_mean_time),
BENCHMARK_THRESHOLDS, "ns")
tables = []
if regressed:
tables.append(md.header("Regressed Latencies 🚩", 3))
tables.append(_sort_benchmarks_and_get_table(regressed, size_cut))
if improved:
tables.append(md.header("Improved Latencies 🎉", 3))
tables.append(_sort_benchmarks_and_get_table(improved, size_cut))
# If we want to abbreviate, similar results won't be interesting.
if similar and size_cut is None:
tables.append(md.header("Similar Latencies", 3))
tables.append(_sort_benchmarks_and_get_table(similar, size_cut))
if raw:
tables.append(md.header("Raw Latencies", 3))
raw_list = [(_make_series_link(name=v.name, series_id=k),
f"{_get_fixed_point_str(v.mean_time / 1e6)}",
f"{_get_fixed_point_str(v.median_time / 1e6)}",
f"{_get_fixed_point_str(v.stddev_time / 1e6)}")
for k, v in raw.items()]
tables.append(
_add_header_and_get_markdown_table(BENCHMARK_RESULTS_HEADERS,
raw_list,
size_cut=size_cut))
return "\n\n".join(tables)
def _sort_metrics_objects_and_get_table(metrics_objs: Dict[str, T],
mapper: MetricsToTableMapper[T],
headers: Sequence[str],
size_cut: Optional[int] = None) -> str:
"""Sorts all metrics objects according to the improvement/regression ratio and
returns a markdown table for it.
Args:
metrics_objs: map of (target_id, CompilationMetrics). All objects must
contain base value.
mapper: MetricsToTableMapper for metrics_objs.
headers: list of table headers.
size_cut: If not None, only show the top N results for each table.
"""
sorted_rows = []
for target_id, metrics_obj in metrics_objs.items():
current, base = mapper.get_current_and_base_value(metrics_obj)
if base is None:
raise AssertionError("Base can't be None for sorting.")
ratio = abs(current - base) / base
sorted_rows.append((ratio, (
_make_series_link(str(metrics_obj), mapper.get_series_id(target_id)),
_get_compare_text(current, base),
)))
sorted_rows.sort(key=lambda row: row[0], reverse=True)
return _add_header_and_get_markdown_table(
headers=headers, rows=[row[1] for row in sorted_rows], size_cut=size_cut)
def categorize_compilation_metrics_into_tables(
compile_metrics_map: Dict[str, CompilationMetrics],
size_cut: Optional[int] = None) -> str:
"""Splits compilation metrics into regressed/improved/all categories
and returns their markdown tables.
If size_cut is None, the table includes regressed/improved/all categories;
otherwise, the table includes regressed/improved categories.
Args:
compile_metrics_map: A dictionary of benchmark names to its compilation
metrics.
size_cut: If not None, only show the top N results for each table.
"""
tables = []
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS:
regressed, improved, _, _ = _categorize_on_single_metric(
compile_metrics_map, mapper.get_current_and_base_value,
mapper.get_metric_thresholds(), mapper.get_unit())
table_title = mapper.get_table_title()
table_header = mapper.get_table_header()
if regressed:
tables.append(md.header(f"Regressed {table_title} 🚩", 3))
tables.append(
_sort_metrics_objects_and_get_table(
metrics_objs=regressed,
mapper=mapper,
headers=["Benchmark Name", table_header],
size_cut=size_cut))
if improved:
tables.append(md.header(f"Improved {table_title} 🎉", 3))
tables.append(
_sort_metrics_objects_and_get_table(
metrics_objs=improved,
mapper=mapper,
headers=["Benchmark Name", table_header],
size_cut=size_cut))
# If we want to abbreviate, similar results won't be interesting.
if size_cut is None and compile_metrics_map:
tables.append(md.header("All Compilation Metrics", 3))
headers = ["Benchmark Name"] + [
mapper.get_table_header()
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS
]
rows = []
for target_id, metrics in compile_metrics_map.items():
row = [metrics.name]
for mapper in COMPILATION_METRICS_TO_TABLE_MAPPERS:
current, base = mapper.get_current_and_base_value(metrics)
row.append(
_make_series_link(_get_compare_text(current, base),
mapper.get_series_id(target_id)))
rows.append(tuple(row))
tables.append(
_add_header_and_get_markdown_table(headers, rows, size_cut=size_cut))
return "\n\n".join(tables)