| # Copyright 2024 The IREE Authors |
| # |
| # Licensed under the Apache License v2.0 with LLVM Exceptions. |
| # See https://llvm.org/LICENSE.txt for license information. |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| import os |
| from collections import namedtuple |
| import logging |
| from typing import Sequence |
| import subprocess |
| import json |
| from pathlib import Path |
| import tabulate |
| from pytest_check import check |
| |
| vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd()) |
| benchmark_dir = os.path.dirname(os.path.realpath(__file__)) |
| artifacts_dir = f"{os.getenv('IREE_TEST_FILES', default=Path.cwd())}/artifacts" |
| artifacts_dir = Path(os.path.expanduser(artifacts_dir)).resolve() |
| prompt_encoder_dir = f"{artifacts_dir}/sdxl_clip" |
| scheduled_unet_dir = f"{artifacts_dir}/sdxl_unet_fp16" |
| punet_int8_fp16_dir = f"{artifacts_dir}/sdxl_punet_int8_fp16" |
| punet_int8_fp8_dir = f"{artifacts_dir}/sdxl_punet_int8_fp8" |
| vae_decode_dir = f"{artifacts_dir}/sdxl_vae" |
| prompt_encoder_dir_compile = f"{vmfb_dir}/sdxl_clip_vmfbs" |
| scheduled_unet_dir_compile = f"{vmfb_dir}/sdxl_unet_fp16_vmfbs" |
| punet_int8_fp16_dir_compile = f"{vmfb_dir}/sdxl_punet_int8_fp16_vmfbs" |
| punet_int8_fp8_dir_compile = f"{vmfb_dir}/sdxl_punet_int8_fp8_vmfbs" |
| vae_decode_dir_compile = f"{vmfb_dir}/sdxl_vae_vmfbs" |
| |
| |
| def run_iree_command(args: Sequence[str] = ()): |
| command = "Exec:", " ".join(args) |
| logging.getLogger().info(command) |
| proc = subprocess.run( |
| args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False |
| ) |
| ( |
| stdout_v, |
| stderr_v, |
| ) = ( |
| proc.stdout, |
| proc.stderr, |
| ) |
| return_code = proc.returncode |
| if return_code == 0: |
| return 0, proc.stdout |
| logging.getLogger().error( |
| f"Command failed!\n" |
| f"Stderr diagnostics:\n{proc.stderr}\n" |
| f"Stdout diagnostics:\n{proc.stdout}\n" |
| ) |
| return 1, proc.stdout |
| |
| |
| def run_sdxl_rocm_benchmark(rocm_chip): |
| exec_args = [ |
| "iree-compile", |
| f"{benchmark_dir}/sdxl_pipeline_bench_f16.mlir", |
| "--iree-hal-target-backends=rocm", |
| f"--iree-hip-target={rocm_chip}", |
| "--iree-global-opt-propagate-transposes=true", |
| "--iree-codegen-llvmgpu-use-vector-distribution", |
| "--iree-codegen-gpu-native-math-precision=true", |
| "--iree-hip-waves-per-eu=2", |
| "--iree-opt-outer-dim-concat=true", |
| "--iree-llvmgpu-enable-prefetch", |
| "-o", |
| f"{benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb", |
| ] |
| # iree compile command for full sdxl pipeline |
| ret_value, stdout = run_iree_command(exec_args) |
| if ret_value == 1: |
| return 1, stdout |
| exec_args = [ |
| "iree-benchmark-module", |
| f"--device=hip", |
| "--device_allocator=caching", |
| f"--module={prompt_encoder_dir_compile}/model.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={prompt_encoder_dir}/real_weights.irpa", |
| f"--module={scheduled_unet_dir_compile}/model.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={scheduled_unet_dir}/real_weights.irpa", |
| f"--module={vae_decode_dir_compile}/model.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={vae_decode_dir}/real_weights.irpa", |
| f"--module={benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb", |
| "--function=tokens_to_image", |
| "--input=1x4x128x128xf16", |
| "--input=1xf16", |
| "--input=1x64xi64", |
| "--input=1x64xi64", |
| "--input=1x64xi64", |
| "--input=1x64xi64", |
| "--benchmark_repetitions=10", |
| "--benchmark_min_warmup_time=3.0", |
| ] |
| # iree benchmark command for full sdxl pipeline |
| return run_iree_command(exec_args) |
| |
| |
| def run_sdxl_unet_rocm_benchmark(rocm_chip): |
| exec_args = [ |
| "iree-benchmark-module", |
| f"--device=hip", |
| "--device_allocator=caching", |
| f"--module={scheduled_unet_dir_compile}/model.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={scheduled_unet_dir}/real_weights.irpa", |
| "--function=run_forward", |
| "--input=1x4x128x128xf16", |
| "--input=2x64x2048xf16", |
| "--input=2x1280xf16", |
| "--input=2x6xf16", |
| "--input=1xf16", |
| "--input=1xi64", |
| "--benchmark_repetitions=10", |
| "--benchmark_min_warmup_time=3.0", |
| ] |
| # iree benchmark command for full sdxl pipeline |
| return run_iree_command(exec_args) |
| |
| |
| def run_sdxl_punet_int8_fp16_rocm_benchmark(rocm_chip): |
| exec_args = [ |
| "iree-benchmark-module", |
| f"--device=hip", |
| "--device_allocator=caching", |
| f"--module={punet_int8_fp16_dir_compile}/punet_fp16.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={punet_int8_fp16_dir}/punet_weights.irpa", |
| "--function=main", |
| f"--input=1x4x128x128xf16", |
| f"--input=1xf16", |
| f"--input=2x64x2048xf16", |
| f"--input=2x1280xf16", |
| f"--input=2x6xf16", |
| f"--input=1xf16", |
| "--benchmark_repetitions=10", |
| "--benchmark_min_warmup_time=3.0", |
| ] |
| # iree benchmark command for full sdxl pipeline |
| return run_iree_command(exec_args) |
| |
| |
| def run_sdxl_punet_int8_fp8_rocm_benchmark(rocm_chip): |
| exec_args = [ |
| "iree-benchmark-module", |
| f"--device=hip", |
| "--device_allocator=caching", |
| f"--module={punet_int8_fp8_dir_compile}/punet_fp8.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={punet_int8_fp8_dir}/punet_fp8_weights.irpa", |
| "--function=main", |
| f"--input=1x4x128x128xf16", |
| f"--input=1xf16", |
| f"--input=2x64x2048xf16", |
| f"--input=2x1280xf16", |
| f"--input=2x6xf16", |
| f"--input=1xf16", |
| "--benchmark_repetitions=10", |
| "--benchmark_min_warmup_time=3.0", |
| ] |
| # iree benchmark command for full sdxl pipeline |
| return run_iree_command(exec_args) |
| |
| |
| def run_sdxl_prompt_encoder_rocm_benchmark(rocm_chip): |
| exec_args = [ |
| "iree-benchmark-module", |
| f"--device=hip", |
| "--device_allocator=caching", |
| f"--module={prompt_encoder_dir_compile}/model.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={prompt_encoder_dir}/real_weights.irpa", |
| "--function=encode_prompts", |
| "--input=1x64xi64", |
| "--input=1x64xi64", |
| "--input=1x64xi64", |
| "--input=1x64xi64", |
| "--benchmark_repetitions=10", |
| "--benchmark_min_warmup_time=3.0", |
| ] |
| # iree benchmark command for full sdxl pipeline |
| return run_iree_command(exec_args) |
| |
| |
| def run_sdxl_vae_decode_rocm_benchmark(rocm_chip): |
| exec_args = [ |
| "iree-benchmark-module", |
| f"--device=hip", |
| "--device_allocator=caching", |
| f"--module={vae_decode_dir_compile}/model.rocm_{rocm_chip}.vmfb", |
| f"--parameters=model={vae_decode_dir}/real_weights.irpa", |
| "--function=main", |
| "--input=1x4x128x128xf16", |
| "--benchmark_repetitions=10", |
| "--benchmark_min_warmup_time=3.0", |
| ] |
| # iree benchmark command for full sdxl pipeline |
| return run_iree_command(exec_args) |
| |
| |
| BenchmarkResult = namedtuple( |
| "BenchmarkResult", "benchmark_name time cpu_time iterations user_counters" |
| ) |
| |
| |
| def decode_output(bench_lines): |
| benchmark_results = [] |
| for line in bench_lines: |
| split = line.split() |
| if len(split) == 0: |
| continue |
| benchmark_name = split[0] |
| time = " ".join(split[1:3]) |
| cpu_time = " ".join(split[3:5]) |
| iterations = split[5] |
| user_counters = None |
| if len(split) > 5: |
| user_counters = split[6] |
| benchmark_results.append( |
| BenchmarkResult( |
| benchmark_name=benchmark_name, |
| time=time, |
| cpu_time=cpu_time, |
| iterations=iterations, |
| user_counters=user_counters, |
| ) |
| ) |
| return benchmark_results |
| |
| |
| def job_summary_process(ret_value, output): |
| if ret_value == 1: |
| # Output should have already been logged earlier. |
| logging.getLogger().error("Running SDXL ROCm benchmark failed. Exiting.") |
| return |
| |
| bench_lines = output.decode().split("\n")[3:] |
| benchmark_results = decode_output(bench_lines) |
| logging.getLogger().info(benchmark_results) |
| benchmark_mean_time = float(benchmark_results[10].time.split()[0]) |
| return benchmark_mean_time |
| |
| |
| def test_sdxl_rocm_benchmark( |
| goldentime_rocm_e2e, |
| goldentime_rocm_unet, |
| goldentime_rocm_punet_int8_fp16, |
| goldentime_rocm_punet_int8_fp8, |
| goldentime_rocm_clip, |
| goldentime_rocm_vae, |
| rocm_chip, |
| goldendispatch_rocm_unet, |
| goldendispatch_rocm_punet_int8_fp16, |
| goldendispatch_rocm_punet_int8_fp8, |
| goldendispatch_rocm_clip, |
| goldendispatch_rocm_vae, |
| goldensize_rocm_unet, |
| goldensize_rocm_punet_int8_fp16, |
| goldensize_rocm_punet_int8_fp8, |
| goldensize_rocm_clip, |
| goldensize_rocm_vae, |
| ): |
| # e2e benchmark |
| ret_value, output = run_sdxl_rocm_benchmark(rocm_chip) |
| benchmark_e2e_mean_time = job_summary_process(ret_value, output) |
| mean_line = ( |
| f"E2E Benchmark Time: {str(benchmark_e2e_mean_time)} ms" |
| f" (golden time {goldentime_rocm_e2e} ms)" |
| ) |
| logging.getLogger().info(mean_line) |
| |
| # unet benchmark |
| ret_value, output = run_sdxl_unet_rocm_benchmark(rocm_chip) |
| benchmark_unet_mean_time = job_summary_process(ret_value, output) |
| mean_line = ( |
| f"Scheduled Unet Benchmark Time: {str(benchmark_unet_mean_time)} ms" |
| f" (golden time {goldentime_rocm_unet} ms)" |
| ) |
| logging.getLogger().info(mean_line) |
| |
| # unet compilation stats check |
| with open(f"{scheduled_unet_dir_compile}/compilation_info.json", "r") as file: |
| comp_stats = json.load(file) |
| unet_dispatch_count = int( |
| comp_stats["stream-aggregate"]["execution"]["dispatch-count"] |
| ) |
| compilation_line = ( |
| f"Scheduled Unet Dispatch Count: {unet_dispatch_count}" |
| f" (golden dispatch count {goldendispatch_rocm_unet})" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| module_path = f"{scheduled_unet_dir_compile}/model.rocm_{rocm_chip}.vmfb" |
| unet_binary_size = Path(module_path).stat().st_size |
| compilation_line = ( |
| f"Scheduled Unet Binary Size: {unet_binary_size} bytes" |
| f" (golden binary size {goldensize_rocm_unet} bytes)" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| if rocm_chip == "gfx942": |
| # punet int8 f16 attention benchmark |
| ret_value, output = run_sdxl_punet_int8_fp16_rocm_benchmark(rocm_chip) |
| benchmark_punet_int8_fp16_mean_time = job_summary_process(ret_value, output) |
| mean_line = ( |
| f"Punet F16 Benchmark Time: {str(benchmark_punet_int8_fp16_mean_time)} ms" |
| f" (golden time {goldentime_rocm_punet_int8_fp16} ms)" |
| ) |
| logging.getLogger().info(mean_line) |
| |
| # punet int8 f16 compilation stats check |
| with open(f"{punet_int8_fp16_dir_compile}/compilation_info.json", "r") as file: |
| comp_stats = json.load(file) |
| punet_int8_fp16_dispatch_count = int( |
| comp_stats["stream-aggregate"]["execution"]["dispatch-count"] |
| ) |
| compilation_line = ( |
| f"Punet F16 Dispatch Count: {punet_int8_fp16_dispatch_count}" |
| f" (golden dispatch count {goldendispatch_rocm_punet_int8_fp16})" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| module_path = f"{punet_int8_fp16_dir_compile}/punet_fp16.rocm_{rocm_chip}.vmfb" |
| punet_int8_fp16_binary_size = Path(module_path).stat().st_size |
| compilation_line = ( |
| f"Punet F16 Binary Size: {punet_int8_fp16_binary_size} bytes" |
| f" (golden binary size {goldensize_rocm_punet_int8_fp16} bytes)" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| # punet int8 f8 attention benchmark |
| ret_value, output = run_sdxl_punet_int8_fp8_rocm_benchmark(rocm_chip) |
| benchmark_punet_int8_fp8_mean_time = job_summary_process(ret_value, output) |
| mean_line = ( |
| f"Punet F8 Benchmark Time: {str(benchmark_punet_int8_fp8_mean_time)} ms" |
| f" (golden time {goldentime_rocm_punet_int8_fp8} ms)" |
| ) |
| logging.getLogger().info(mean_line) |
| |
| # punet int8 f8 compilation stats check |
| with open(f"{punet_int8_fp8_dir_compile}/compilation_info.json", "r") as file: |
| comp_stats = json.load(file) |
| punet_int8_fp8_dispatch_count = int( |
| comp_stats["stream-aggregate"]["execution"]["dispatch-count"] |
| ) |
| compilation_line = ( |
| f"Punet F8 Dispatch Count: {punet_int8_fp8_dispatch_count}" |
| f" (golden dispatch count {goldendispatch_rocm_punet_int8_fp8})" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| module_path = f"{punet_int8_fp8_dir_compile}/punet_fp8.rocm_{rocm_chip}.vmfb" |
| punet_int8_fp8_binary_size = Path(module_path).stat().st_size |
| compilation_line = ( |
| f"Punet F8 Binary Size: {punet_int8_fp8_binary_size} bytes" |
| f" (golden binary size {goldensize_rocm_punet_int8_fp8} bytes)" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| # prompt encoder benchmark |
| ret_value, output = run_sdxl_prompt_encoder_rocm_benchmark(rocm_chip) |
| benchmark_clip_mean_time = job_summary_process(ret_value, output) |
| mean_line = ( |
| f"Prompt Encoder Benchmark Time: {str(benchmark_clip_mean_time)} ms" |
| f" (golden time {goldentime_rocm_clip} ms)" |
| ) |
| logging.getLogger().info(mean_line) |
| |
| # prompt encoder compilation stats check |
| with open(f"{prompt_encoder_dir_compile}/compilation_info.json", "r") as file: |
| comp_stats = json.load(file) |
| clip_dispatch_count = int( |
| comp_stats["stream-aggregate"]["execution"]["dispatch-count"] |
| ) |
| compilation_line = ( |
| f"Prompt Encoder Dispatch Count: {clip_dispatch_count}" |
| f" (golden dispatch count {goldendispatch_rocm_clip})" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| module_path = f"{prompt_encoder_dir_compile}/model.rocm_{rocm_chip}.vmfb" |
| clip_binary_size = Path(module_path).stat().st_size |
| compilation_line = ( |
| f"Prompt Encoder Binary Size: {clip_binary_size} bytes" |
| f" (golden binary size {goldensize_rocm_clip} bytes)" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| # vae decode benchmark |
| ret_value, output = run_sdxl_vae_decode_rocm_benchmark(rocm_chip) |
| benchmark_vae_mean_time = job_summary_process(ret_value, output) |
| mean_line = ( |
| f"VAE Decode Benchmark Time: {str(benchmark_vae_mean_time)} ms" |
| f" (golden time {goldentime_rocm_vae} ms)" |
| ) |
| logging.getLogger().info(mean_line) |
| |
| # vae decode compilation stats check |
| with open(f"{vae_decode_dir_compile}/compilation_info.json", "r") as file: |
| comp_stats = json.load(file) |
| vae_dispatch_count = int( |
| comp_stats["stream-aggregate"]["execution"]["dispatch-count"] |
| ) |
| compilation_line = ( |
| f"VAE Decode Dispatch Count: {vae_dispatch_count}" |
| f" (golden dispatch count {goldendispatch_rocm_vae})" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| module_path = f"{vae_decode_dir_compile}/model.rocm_{rocm_chip}.vmfb" |
| vae_binary_size = Path(module_path).stat().st_size |
| compilation_line = ( |
| f"VAE Decode Binary Size: {vae_binary_size} bytes" |
| f" (golden binary size {goldensize_rocm_vae} bytes)" |
| ) |
| logging.getLogger().info(compilation_line) |
| |
| # Create mean time table's header and rows |
| mean_time_header = ["Benchmark", "Current time (ms)", "Expected/golden time (ms)"] |
| mean_time_rows = [ |
| ["E2E†", f"{benchmark_e2e_mean_time}", f"{goldentime_rocm_e2e}"], |
| ["Scheduled Unet", f"{benchmark_unet_mean_time}", f"{goldentime_rocm_unet}"], |
| ["Prompt Encoder", f"{benchmark_clip_mean_time}", f"{goldentime_rocm_clip}"], |
| ["VAE Decode", f"{benchmark_vae_mean_time}", f"{goldentime_rocm_vae}"], |
| ] |
| if rocm_chip == "gfx942": |
| mean_time_rows.append( |
| [ |
| "Punet F16", |
| f"{benchmark_punet_int8_fp16_mean_time}", |
| f"{goldentime_rocm_punet_int8_fp16}", |
| ] |
| ) |
| mean_time_rows.append( |
| [ |
| "Punet F8", |
| f"{benchmark_punet_int8_fp8_mean_time}", |
| f"{goldentime_rocm_punet_int8_fp8}", |
| ] |
| ) |
| |
| # Create dispatch count table's header and rows |
| dispatch_count_header = [ |
| "Benchmark", |
| "Current dispatch count", |
| "Expected/golden dispatch count", |
| ] |
| dispatch_count_rows = [ |
| ["Scheduled Unet", f"{unet_dispatch_count}", f"{goldendispatch_rocm_unet}"], |
| ["Prompt Encoder", f"{clip_dispatch_count}", f"{goldendispatch_rocm_clip}"], |
| ["VAE Decode", f"{vae_dispatch_count}", f"{goldendispatch_rocm_vae}"], |
| ] |
| if rocm_chip == "gfx942": |
| dispatch_count_rows.append( |
| [ |
| "Punet F16", |
| f"{punet_int8_fp16_dispatch_count}", |
| f"{goldendispatch_rocm_punet_int8_fp16}", |
| ] |
| ) |
| dispatch_count_rows.append( |
| [ |
| "Punet F8", |
| f"{punet_int8_fp8_dispatch_count}", |
| f"{goldendispatch_rocm_punet_int8_fp8}", |
| ] |
| ) |
| |
| # Create binary size table's header and rows |
| binary_size_header = [ |
| "Benchmark", |
| "Current binary size (bytes)", |
| "Expected/golden binary size (bytes)", |
| ] |
| binary_size_rows = [ |
| ["Scheduled Unet", f"{unet_binary_size}", f"{goldensize_rocm_unet}"], |
| ["Prompt Encoder", f"{clip_binary_size}", f"{goldensize_rocm_clip}"], |
| ["VAE Decode", f"{vae_binary_size}", f"{goldensize_rocm_vae}"], |
| ] |
| if rocm_chip == "gfx942": |
| binary_size_rows.append( |
| [ |
| "Punet F16", |
| f"{punet_int8_fp16_binary_size}", |
| f"{goldensize_rocm_punet_int8_fp16}", |
| ] |
| ) |
| binary_size_rows.append( |
| [ |
| "Punet F8", |
| f"{punet_int8_fp8_binary_size}", |
| f"{goldensize_rocm_punet_int8_fp8}", |
| ] |
| ) |
| |
| # Create mean time table using tabulate |
| mean_time_full = [mean_time_header] + mean_time_rows |
| mean_time_table = tabulate.tabulate( |
| mean_time_full, headers="firstrow", tablefmt="pipe" |
| ) |
| |
| # Create dispatch count table using tabulate |
| dispatch_count_full = [dispatch_count_header] + dispatch_count_rows |
| dispatch_count_table = tabulate.tabulate( |
| dispatch_count_full, headers="firstrow", tablefmt="pipe" |
| ) |
| |
| # Create binary size of compiled artifacts table using tabulate |
| binary_size_full = [binary_size_header] + binary_size_rows |
| binary_size_table = tabulate.tabulate( |
| binary_size_full, headers="firstrow", tablefmt="pipe" |
| ) |
| |
| # Write markdown tables to job summary file |
| with open("job_summary.md", "w") as job_summary: |
| print("SDXL Benchmark Summary:\n", file=job_summary) |
| print(mean_time_table, file=job_summary) |
| print("\n† E2E = Encode + Scheduled Unet * 3 + Decode\n", file=job_summary) |
| print(dispatch_count_table, file=job_summary) |
| print("\n", file=job_summary) |
| print(binary_size_table, file=job_summary) |
| |
| # Check all values are either <= than golden values for times and == for compilation statistics. |
| |
| check.less_equal( |
| benchmark_e2e_mean_time, |
| goldentime_rocm_e2e, |
| "SDXL e2e benchmark time should not regress", |
| ) |
| check.less_equal( |
| benchmark_unet_mean_time, |
| goldentime_rocm_unet, |
| "SDXL unet benchmark time should not regress", |
| ) |
| check.less_equal( |
| unet_dispatch_count, |
| goldendispatch_rocm_unet, |
| "SDXL scheduled unet dispatch count should not regress", |
| ) |
| check.less_equal( |
| unet_binary_size, |
| goldensize_rocm_unet, |
| "SDXL scheduled unet binary size should not get bigger", |
| ) |
| if rocm_chip == "gfx942": |
| check.less_equal( |
| benchmark_punet_int8_fp16_mean_time, |
| goldentime_rocm_punet_int8_fp16, |
| "SDXL punet f16 benchmark time should not regress", |
| ) |
| check.less_equal( |
| punet_int8_fp16_dispatch_count, |
| goldendispatch_rocm_punet_int8_fp16, |
| "SDXL punet f16 dispatch count should not regress", |
| ) |
| check.less_equal( |
| punet_int8_fp16_binary_size, |
| goldensize_rocm_punet_int8_fp16, |
| "SDXL punet f16 binary size should not get bigger", |
| ) |
| check.less_equal( |
| benchmark_punet_int8_fp8_mean_time, |
| goldentime_rocm_punet_int8_fp8, |
| "SDXL punet f8 benchmark time should not regress", |
| ) |
| check.less_equal( |
| punet_int8_fp8_dispatch_count, |
| goldendispatch_rocm_punet_int8_fp8, |
| "SDXL punet f8 dispatch count should not regress", |
| ) |
| check.less_equal( |
| punet_int8_fp8_binary_size, |
| goldensize_rocm_punet_int8_fp8, |
| "SDXL punet f8 binary size should not get bigger", |
| ) |
| check.less_equal( |
| benchmark_clip_mean_time, |
| goldentime_rocm_clip, |
| "SDXL prompt encoder benchmark time should not regress", |
| ) |
| check.less_equal( |
| clip_dispatch_count, |
| goldendispatch_rocm_clip, |
| "SDXL prompt encoder dispatch count should not regress", |
| ) |
| check.less_equal( |
| clip_binary_size, |
| goldensize_rocm_clip, |
| "SDXL prompt encoder binary size should not get bigger", |
| ) |
| check.less_equal( |
| benchmark_vae_mean_time, |
| goldentime_rocm_vae, |
| "SDXL vae decode benchmark time should not regress", |
| ) |
| check.less_equal( |
| vae_dispatch_count, |
| goldendispatch_rocm_vae, |
| "SDXL vae decode dispatch count should not regress", |
| ) |
| check.less_equal( |
| vae_binary_size, |
| goldensize_rocm_vae, |
| "SDXL vae decode binary size should not get bigger", |
| ) |