| #!/usr/bin/env python3 |
| # Copyright 2021 The IREE Authors |
| # |
| # Licensed under the Apache License v2.0 with LLVM Exceptions. |
| # See https://llvm.org/LICENSE.txt for license information. |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| """Posts benchmark results to GitHub as pull request comments. |
| |
This script is meant to be used by Buildkite for automation. It requires the
following environment variables to be set:
| |
- BUILDKITE_BUILD_NUMBER: the build number of the current Buildkite build.
| - BUILDKITE_BUILD_URL: the link to the current Buildkite build. |
| - BUILDKITE_COMMIT: the pull request HEAD commit. |
| - BUILDKITE_PULL_REQUEST: the current pull request number. |
- GITHUB_TOKEN: personal access token to authenticate against the GitHub API;
  it should have the "public_repo" and "gist" scopes.
| |
If --query-base is enabled, it additionally requires:
| |
- BUILDKITE_PULL_REQUEST_BASE_BRANCH: the target base branch.
- IREE_DASHBOARD_URL: the URL of IREE's performance dashboard.
| |
This script requires the "markdown_strings" pip package.
| |
| Example usage: |
| # Export necessary environment variables: |
| export ... |
| # Then run the script: |
| python3 post_benchmarks_as_pr_comment.py <benchmark-json-file>... |
  # where each <benchmark-json-file> is expected to follow the format of
  # BenchmarkResults objects.
| """ |
| |
import argparse
import json
import os
import urllib.parse
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Tuple

import markdown_strings as md
import requests
| |
| from common.benchmark_definition import BenchmarkResults, execute_cmd_and_get_output |
| from common.benchmark_thresholds import BENCHMARK_THRESHOLDS, ThresholdUnit |
| |
| ABBR_PR_COMMENT_TITLE = "Abbreviated Benchmark Summary" |
| GITHUB_GIST_API_PREFIX = "https://api.github.com/gists" |
| GITHUB_IREE_API_PREFIX = "https://api.github.com/repos/google/iree" |
| GITHUB_IREE_REPO_PREFIX = "https://github.com/google/iree" |
| GITHUB_USER = "iree-github-actions-bot" |
| IREE_PROJECT_ID = 'IREE' |
# The maximum number of attempts when querying base commit benchmark results.
| MAX_BASE_COMMIT_QUERY_COUNT = 10 |
| PERFBOARD_SERIES_PREFIX = "https://perf.iree.dev/serie?IREE?" |
| # The max number of rows to show per table. |
| TABLE_SIZE_CUT = 3 |
| THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) |
| |
| |
| def get_required_env_var(var: str) -> str: |
| """Gets the value for a required environment variable.""" |
| value = os.getenv(var, None) |
| if value is None: |
| raise RuntimeError(f'Missing environment variable "{var}"') |
| return value |
| |
| |
| def get_git_commit_hash(commit: str, verbose: bool = False) -> str: |
| """Gets the commit hash for the given commit.""" |
| return execute_cmd_and_get_output(['git', 'rev-parse', commit], |
| cwd=THIS_DIRECTORY, |
| verbose=verbose) |
| |
| |
| def get_git_total_commit_count(commit: str, verbose: bool = False) -> int: |
| """Gets the total commit count in history ending with the given commit.""" |
| count = execute_cmd_and_get_output(['git', 'rev-list', '--count', commit], |
| cwd=THIS_DIRECTORY, |
| verbose=verbose) |
| return int(count) |
| |
| |
| def get_origin_tree_commit(distance: int, verbose: bool = False) -> str: |
| """Returns the hash for the commit with the given distance from top of the |
| tree for the origin base branch.""" |
| base_branch = get_required_env_var("BUILDKITE_PULL_REQUEST_BASE_BRANCH") |
| execute_cmd_and_get_output( |
| ['git', 'fetch', '--prune', '--', 'origin', base_branch], |
| cwd=THIS_DIRECTORY, |
| verbose=verbose) |
| return get_git_commit_hash(f'origin/{base_branch}~{distance}', verbose) |
| |
| |
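# Illustrative sketch (URL and values are hypothetical): a dashboard query
# built the same way as in query_base_benchmark_results() would look like
#   get_from_dashboard("https://<dashboard>/apis/getBuild",
#                      {"projectId": "IREE", "buildId": 1234})
# and is expected to return a dict mapping benchmark names to latency numbers.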
| def get_from_dashboard(url: str, |
| payload: Dict[str, Any], |
| verbose: bool = False) -> Dict[str, int]: |
| headers = {'Content-type': 'application/json'} |
| data = json.dumps(payload) |
| |
| if verbose: |
| print(f'API request payload: {data}') |
| |
| response = requests.get(url, data=data, headers=headers) |
| code = response.status_code |
| if code != 200: |
| raise requests.RequestException( |
| f'Failed to get from dashboard server with status code {code}') |
| |
| data = response.json() |
| if verbose: |
| print(f'Queried base benchmark data: {data}') |
| return data |
| |
| |
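# A minimal example of the record defined below (numbers are made up):
#   AggregateBenchmarkLatency(mean_time=12, median_time=11, stddev_time=1)
# base_mean_time stays None unless base results are later pulled in via
# --query-base.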
| @dataclass |
| class AggregateBenchmarkLatency: |
| """An object for describing aggregate latency numbers for a benchmark.""" |
| mean_time: int |
| median_time: int |
| stddev_time: int |
| # The average latency time for the base commit to compare against. |
| base_mean_time: Optional[int] = None |
| |
| |
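# Sketch of the mapping returned below, keyed by unique benchmark name (the
# name and numbers are hypothetical):
#   {"<benchmark name>": AggregateBenchmarkLatency(mean_time=12,
#                                                  median_time=11,
#                                                  stddev_time=1)}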
| def aggregate_all_benchmarks( |
| benchmark_files: Sequence[str], |
| verbose: bool = False) -> Dict[str, AggregateBenchmarkLatency]: |
| """Aggregates all benchmarks in the given files. |
| |
| Args: |
  - benchmark_files: A list of JSON files, each of which can be decoded as a
    BenchmarkResults object.
| |
| Returns: |
| - A dict of benchmark names to AggregateBenchmarkLatency numbers. |
| """ |
| |
| pr_commit = get_required_env_var("BUILDKITE_COMMIT") |
| aggregate_results = {} |
| |
| for benchmark_file in benchmark_files: |
| with open(benchmark_file) as f: |
| content = f.read() |
| file_results = BenchmarkResults.from_json_str(content) |
| |
| if file_results.commit != pr_commit: |
| raise ValueError("Inconsistent pull request commit") |
| |
| for benchmark_index in range(len(file_results.benchmarks)): |
| benchmark_case = file_results.benchmarks[benchmark_index] |
| |
| # Make sure each benchmark has a unique name. |
| name = str(benchmark_case["benchmark"]) |
| if name in aggregate_results: |
| raise ValueError(f"Duplicated benchmarks: {name}") |
| |
| # Now scan all benchmark iterations and find the aggregate results. |
| mean_time = file_results.get_aggregate_time(benchmark_index, "mean") |
| median_time = file_results.get_aggregate_time(benchmark_index, "median") |
| stddev_time = file_results.get_aggregate_time(benchmark_index, "stddev") |
| |
| aggregate_results[name] = AggregateBenchmarkLatency( |
| mean_time, median_time, stddev_time) |
| |
| return aggregate_results |
| |
| |
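# The dashboard identifies a build by the total commit count of the queried
# commit, so a call such as query_base_benchmark_results("<base commit hash>")
# sends a payload like {"projectId": "IREE", "buildId": 12345} (the buildId
# value here is hypothetical).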
def query_base_benchmark_results(commit: str,
                                 verbose: bool = False) -> Dict[str, int]:
| """Queries the benchmark results for the given commit.""" |
| build_id = get_git_total_commit_count(commit, verbose) |
| |
| url = get_required_env_var('IREE_DASHBOARD_URL') |
| payload = {'projectId': IREE_PROJECT_ID, 'buildId': build_id} |
| return get_from_dashboard(f'{url}/apis/getBuild', payload, verbose=verbose) |
| |
| |
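# Example of the markup produced below (the benchmark name is hypothetical;
# characters outside the safe set are percent-encoded):
#   make_benchmark_clickable("some-benchmark")
#   # => "[some-benchmark](https://perf.iree.dev/serie?IREE?some-benchmark)"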
| def make_benchmark_clickable(name: str) -> str: |
| """Add link to the given benchmark name.""" |
| url = PERFBOARD_SERIES_PREFIX + urllib.parse.quote(name, safe="()[]@,") |
| return md.link(name, url) |
| |
| |
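# Illustration of the expected inputs (all values hypothetical); each argument
# is one table column and headers are prepended inside the function:
#   add_header_and_get_markdown_table(names=("bench-a", "bench-b"),
#                                     means=(10, 12),
#                                     medians=(9, 11),
#                                     stddevs=(1, 2),
#                                     size_cut=1)
# With size_cut=1 only "bench-a" is listed, followed by a "[Top 1 out of 2
# benchmark results shown]" note.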
def add_header_and_get_markdown_table(names: Tuple[str, ...],
                                      means: Tuple[Any, ...],
                                      medians: Tuple[int, ...],
                                      stddevs: Tuple[int, ...],
                                      size_cut: Optional[int] = None) -> str:
| """Generates a markdown table with proper headers for benchmarks. |
| |
| Args: |
| - size_cut: If not None, only show the top N results for each table. |
| """ |
| total_size = len(names) |
| if size_cut is not None: |
| names = names[0:size_cut] |
| means = means[0:size_cut] |
| medians = medians[0:size_cut] |
| stddevs = stddevs[0:size_cut] |
| |
| names = tuple([make_benchmark_clickable(name) for name in names]) |
| names = ("Benchmark Name",) + names |
| means = ("Average Latency (ms)",) + means |
| medians = ("Median Latency (ms)",) + medians |
| stddevs = ("Latency Standard Deviation (ms)",) + stddevs |
| |
| table_str = md.table([names, means, medians, stddevs]) |
| if size_cut is not None and size_cut < total_size: |
| table_str += "\n\n" |
| table_str += md.italics( |
| f"[Top {size_cut} out of {total_size} benchmark results showed]") |
| return table_str |
| |
| |
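# Sketch of the mean-column formatting below (numbers are made up): a benchmark
# with mean_time=12 and base_mean_time=10 has ratio 0.20 and is rendered as
#   "12 (vs. 10, 20.00%↑)"
# Rows are ordered by that ratio, largest change first.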
| def sort_benchmarks_and_get_table(benchmarks: Dict[str, |
| AggregateBenchmarkLatency], |
                                  size_cut: Optional[int] = None) -> str:
| """Sorts all benchmarks according to the improvement/regression ratio and |
| returns a markdown table for it. |
| |
| Args: |
| - size_cut: If not None, only show the top N results for each table. |
| """ |
| sorted_benchmarks = [] |
| for k, v in benchmarks.items(): |
| ratio = abs(v.mean_time - v.base_mean_time) / v.base_mean_time |
| sorted_benchmarks.append((k, (v.mean_time, v.base_mean_time, ratio), |
| v.median_time, v.stddev_time)) |
  # Sort by the ratio in descending order.
| sorted_benchmarks.sort(key=lambda benchmark: benchmark[1][2], reverse=True) |
| |
  # Split each field into its own tuple in preparation for the markdown table.
| names, means, medians, stddevs = zip(*sorted_benchmarks) |
| |
  # Turn the means tuple into string representations.
| str_means = [] |
| for pr, base, ratio in means: |
| direction = "↑" if pr > base else ("↓" if pr < base else "") |
| str_means.append(f"{pr} (vs. {base}, {ratio:.2%}{direction})") |
| str_means = tuple(str_means) |
| |
| return add_header_and_get_markdown_table(names, str_means, medians, stddevs, |
| size_cut) |
| |
| |
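# Classification sketch (the 5% threshold here is hypothetical; real values
# come from BENCHMARK_THRESHOLDS): against a base mean of 100, a mean of 103
# (3% change) counts as similar, 110 as regressed, and 90 as improved;
# benchmarks without base numbers go into the raw table.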
| def categorize_benchmarks_into_tables(benchmarks: Dict[ |
| str, AggregateBenchmarkLatency], |
| size_cut: Optional[int] = None) -> str: |
| """Splits benchmarks into regressed/improved/similar/raw categories and |
| returns their markdown tables. |
| |
| Args: |
  - benchmarks: A dictionary mapping benchmark names to their aggregate info.
| - size_cut: If not None, only show the top N results for each table. |
| """ |
| regressed, improved, similar, raw = {}, {}, {}, {} |
| |
| for name, results in benchmarks.items(): |
    # If there is no information about the base result, we cannot analyze.
| if results.base_mean_time is None: |
| raw[name] = results |
| continue |
| |
| similar_threshold = None |
| for threshold in BENCHMARK_THRESHOLDS: |
| if threshold.regex.match(name): |
| similar_threshold = threshold |
| break |
| if similar_threshold is None: |
| raise ValueError(f"no matched threshold setting for benchmark: {name}") |
| |
| current = results.mean_time |
| base = results.base_mean_time |
| if similar_threshold.unit == ThresholdUnit.PERCENTAGE: |
| ratio = abs(current - base) / base |
| else: |
| ratio = abs(current - base) |
| |
| if ratio <= similar_threshold.threshold: |
| similar[name] = results |
| elif current > base: |
| regressed[name] = results |
| else: |
| improved[name] = results |
| |
| tables = [] |
| if regressed: |
| tables.append(md.header("Regressed Benchmarks 🚩", 3)) |
| tables.append(sort_benchmarks_and_get_table(regressed, size_cut)) |
| if improved: |
| tables.append(md.header("Improved Benchmarks 🎉", 3)) |
| tables.append(sort_benchmarks_and_get_table(improved, size_cut)) |
| # If we want to abbreviate, similar results won't be interesting. |
| if similar and size_cut is None: |
| tables.append(md.header("Similar Benchmarks", 3)) |
| tables.append(sort_benchmarks_and_get_table(similar, size_cut)) |
| if raw: |
| tables.append(md.header("Raw Benchmarks", 3)) |
| raw_list = [ |
| (k, v.mean_time, v.median_time, v.stddev_time) for k, v in raw.items() |
| ] |
| names, means, medians, stddevs = zip(*raw_list) |
| tables.append( |
| add_header_and_get_markdown_table(names=names, |
| means=means, |
| medians=medians, |
| stddevs=stddevs, |
| size_cut=size_cut)) |
| return "\n\n".join(tables) |
| |
| |
| def get_benchmark_result_markdown(benchmark_files: Sequence[str], |
| query_base: bool, |
| verbose: bool = False) -> Tuple[str, str]: |
| """Gets the full/abbreviated markdown summary of all benchmarks in files.""" |
| all_benchmarks = aggregate_all_benchmarks(benchmark_files, verbose=verbose) |
| |
| build_url = get_required_env_var("BUILDKITE_BUILD_URL") |
| pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST") |
| pr_commit = get_required_env_var("BUILDKITE_COMMIT") |
| pr_commit = md.link(pr_commit, |
| f"{GITHUB_IREE_REPO_PREFIX}/commit/{pr_commit}") |
| |
| commit_info = f"@ commit {pr_commit}" |
| if query_base: |
    # Try to query some base benchmark results to diff against, starting from
    # the top of the tree. Bail out once the maximum number of attempts is
    # exceeded.
| for i in range(MAX_BASE_COMMIT_QUERY_COUNT): |
| base_commit = get_origin_tree_commit(i, verbose) |
| base_benchmarks = query_base_benchmark_results(base_commit, verbose) |
| base_commit = md.link(base_commit, |
| f"{GITHUB_IREE_REPO_PREFIX}/commit/{base_commit}") |
| |
| if len(base_benchmarks) == 0: |
| commit_info = (f"@ commit {pr_commit} (no previous benchmark results to" |
| f" compare against since {base_commit})") |
| continue |
| |
| # Update the aggregate benchmarks with base numbers. |
| for bench in base_benchmarks: |
| if bench in all_benchmarks: |
| all_benchmarks[bench].base_mean_time = base_benchmarks[bench] |
| commit_info = f"@ commit {pr_commit} (vs. base {base_commit})" |
| break |
| |
| pr_info = md.link("Pull request", |
| f"{GITHUB_IREE_REPO_PREFIX}/pull/{pr_number}") |
| buildkite_info = md.link("Buildkite build", build_url) |
| |
| # Compose the full benchmark tables. |
| full_table = [md.header("Full Benchmark Summary", 2)] |
| full_table.append(md.unordered_list([commit_info, pr_info, buildkite_info])) |
| full_table.append(categorize_benchmarks_into_tables(all_benchmarks)) |
| |
| # Compose the abbreviated benchmark tables. |
| abbr_table = [md.header(ABBR_PR_COMMENT_TITLE, 2)] |
| abbr_table.append(commit_info) |
| tables = categorize_benchmarks_into_tables(all_benchmarks, TABLE_SIZE_CUT) |
| if len(tables) == 0: |
| abbr_table.append("No improved or regressed benchmarks 🏖️") |
| else: |
| abbr_table.append(tables) |
| abbr_table.append("For more information:") |
  # We don't know the Gist URL until the Gist is actually created. Use a
  # placeholder for now and replace it later.
| full_result_info = md.link("Full benchmark result tables", |
| "<<placeholder-link>>") |
| abbr_table.append(md.unordered_list([full_result_info, buildkite_info])) |
| |
| return "\n\n".join(full_table), "\n\n".join(abbr_table) |
| |
| |
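# The returned Gist URL has the following form (the id is a made-up example):
#   https://gist.github.com/iree-github-actions-bot/0123456789abcdef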
def post_to_gist(filename: str, content: str, verbose: bool = False) -> str:
| """Posts the given content to a new GitHub Gist and returns the URL to it.""" |
| api_token = get_required_env_var('GITHUB_TOKEN') |
| headers = { |
| "Accept": "application/vnd.github.v3+json", |
| "Authorization": f"token {api_token}", |
| } |
| payload = json.dumps({ |
| "public": True, |
| "files": { |
| filename: { |
| "content": content |
| } |
| } |
| }) |
| |
| api_endpoint = GITHUB_GIST_API_PREFIX |
| response = requests.post(api_endpoint, data=payload, headers=headers) |
| if response.status_code != 201: |
| raise requests.RequestException( |
| f"Failed to comment on GitHub; error code: {response.status_code}") |
| |
| response = response.json() |
| if verbose: |
| print(f"Gist posting response: {response}") |
| |
| if response["truncated"]: |
| raise requests.RequestException(f"Content too large and gotten truncated") |
| |
| gist_id = response["id"] |
| return f"https://gist.github.com/{GITHUB_USER}/{gist_id}" |
| |
| |
| def get_previous_comment_on_pr(pr_number: str, |
| verbose: bool = False) -> Optional[int]: |
| """Gets the previous comment's ID from GitHub.""" |
| # Increasing per_page limit requires user authentication. |
| api_token = get_required_env_var('GITHUB_TOKEN') |
| headers = { |
| "Accept": "application/vnd.github.v3+json", |
| "Authorization": f"token {api_token}", |
| } |
| payload = json.dumps({"per_page": 100}) |
| |
| api_endpoint = f"{GITHUB_IREE_API_PREFIX}/issues/{pr_number}/comments" |
  response = requests.get(api_endpoint, params=payload, headers=headers)
| if response.status_code != 200: |
| raise requests.RequestException( |
| f"Failed to get PR comments from GitHub; error code: {response.status_code}" |
| ) |
| |
| response = response.json() |
| if verbose: |
| print(f"Previous comment query response: {response}") |
| |
  # Find the last comment from GITHUB_USER that contains the
  # ABBR_PR_COMMENT_TITLE keyword.
| for comment in reversed(response): |
| if (comment["user"]["login"] == GITHUB_USER) and (ABBR_PR_COMMENT_TITLE |
| in comment["body"]): |
| return comment["id"] |
| return None |
| |
| |
| def create_comment_on_pr(pr_number: str, content: str, verbose: bool = False): |
| """Posts the given content as comments to the current pull request.""" |
| api_token = get_required_env_var('GITHUB_TOKEN') |
| headers = { |
| "Accept": "application/vnd.github.v3+json", |
| "Authorization": f"token {api_token}", |
| } |
| payload = json.dumps({"body": content}) |
| |
| api_endpoint = f"{GITHUB_IREE_API_PREFIX}/issues/{pr_number}/comments" |
| response = requests.post(api_endpoint, data=payload, headers=headers) |
| if response.status_code != 201: |
| raise requests.RequestException( |
| f"Failed to comment on GitHub; error code: {response.status_code}") |
| |
| |
| def update_comment_on_pr(comment_id: int, content: str, verbose: bool = False): |
| """Updates the content of the given comment.""" |
| api_token = get_required_env_var('GITHUB_TOKEN') |
| headers = { |
| "Accept": "application/vnd.github.v3+json", |
| "Authorization": f"token {api_token}", |
| } |
| payload = json.dumps({"body": content}) |
| |
| api_endpoint = f"{GITHUB_IREE_API_PREFIX}/issues/comments/{comment_id}" |
| response = requests.patch(api_endpoint, data=payload, headers=headers) |
| if response.status_code != 200: |
| raise requests.RequestException( |
| f"Failed to comment on GitHub; error code: {response.status_code}") |
| |
| |
| def parse_arguments(): |
| """Parses command-line options.""" |
| |
| def check_file_path(path): |
| if os.path.isfile(path): |
| return path |
| else: |
| raise ValueError(path) |
| |
| parser = argparse.ArgumentParser() |
| parser.add_argument("benchmark_files", |
| metavar="<benchmark-json-file>", |
| type=check_file_path, |
| nargs="+", |
| help="Path to the JSON file containing benchmark results") |
| parser.add_argument("--dry-run", |
| action="store_true", |
| help="Print the comment instead of posting to GitHub") |
| parser.add_argument( |
| "--query-base", |
| action="store_true", |
| help= |
| "Query the dashboard for the benchmark results of the targeting base branch" |
| ) |
| parser.add_argument("--verbose", |
| action="store_true", |
| help="Print internal information during execution") |
| args = parser.parse_args() |
| |
| return args |
| |
| |
| def main(args): |
| full_md, abbr_md = get_benchmark_result_markdown(args.benchmark_files, |
| query_base=args.query_base, |
| verbose=args.verbose) |
| |
| if args.dry_run: |
| print(full_md, "\n\n", abbr_md) |
| return |
| |
| pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST") |
| # Buildkite sets this to "false" if not running on a PR: |
| # https://buildkite.com/docs/pipelines/environment-variables#bk-env-vars-buildkite-pull-request |
| if pr_number == "false": |
| raise ValueError("Not a pull request") |
| |
| build_number = get_required_env_var("BUILDKITE_BUILD_NUMBER") |
| filename = f"iree-full-benchmark-result-{build_number}.md" |
| gist_url = post_to_gist(filename, full_md, args.verbose) |
| abbr_md = abbr_md.replace("<<placeholder-link>>", gist_url) |
| |
| previous_comment = get_previous_comment_on_pr(pr_number, args.verbose) |
| if previous_comment is not None: |
| update_comment_on_pr(previous_comment, abbr_md, args.verbose) |
| else: |
| create_comment_on_pr(pr_number, abbr_md, args.verbose) |
| |
| |
| if __name__ == "__main__": |
| main(parse_arguments()) |