# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
import os
from collections import namedtuple
import logging
from typing import Sequence
import subprocess
import json
from pathlib import Path
import tabulate
from pytest_check import check
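# Directory layout: source artifacts (MLIR and .irpa weights) live under
# IREE_TEST_FILES/artifacts, while compiled .vmfb modules are read from
# TEST_OUTPUT_ARTIFACTS (defaulting to the current working directory).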
vmfb_dir = os.getenv("TEST_OUTPUT_ARTIFACTS", default=Path.cwd())
benchmark_dir = os.path.dirname(os.path.realpath(__file__))
artifacts_dir = f"{os.getenv('IREE_TEST_FILES', default=Path.cwd())}/artifacts"
artifacts_dir = Path(os.path.expanduser(artifacts_dir)).resolve()
prompt_encoder_dir = f"{artifacts_dir}/sdxl_clip"
scheduled_unet_dir = f"{artifacts_dir}/sdxl_unet_fp16"
punet_int8_fp16_dir = f"{artifacts_dir}/sdxl_punet_int8_fp16"
punet_int8_fp8_dir = f"{artifacts_dir}/sdxl_punet_int8_fp8"
vae_decode_dir = f"{artifacts_dir}/sdxl_vae"
prompt_encoder_dir_compile = f"{vmfb_dir}/sdxl_clip_vmfbs"
scheduled_unet_dir_compile = f"{vmfb_dir}/sdxl_unet_fp16_vmfbs"
punet_int8_fp16_dir_compile = f"{vmfb_dir}/sdxl_punet_int8_fp16_vmfbs"
punet_int8_fp8_dir_compile = f"{vmfb_dir}/sdxl_punet_int8_fp8_vmfbs"
vae_decode_dir_compile = f"{vmfb_dir}/sdxl_vae_vmfbs"
def run_iree_command(args: Sequence[str] = ()):
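    """Run a command in a subprocess and capture its output.

    Returns (0, stdout) on success and (1, stdout) on failure, logging the
    stdout/stderr diagnostics when the command fails.
    """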
command = "Exec:", " ".join(args)
logging.getLogger().info(command)
proc = subprocess.run(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False
)
    if proc.returncode == 0:
        return 0, proc.stdout
logging.getLogger().error(
f"Command failed!\n"
f"Stderr diagnostics:\n{proc.stderr}\n"
f"Stdout diagnostics:\n{proc.stdout}\n"
)
return 1, proc.stdout
def run_sdxl_rocm_benchmark(rocm_chip):
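    """Compile the full SDXL pipeline MLIR for the given ROCm chip, then
    benchmark the end-to-end tokens_to_image function using the prompt
    encoder, scheduled unet, and VAE decode modules."""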
exec_args = [
"iree-compile",
f"{benchmark_dir}/sdxl_pipeline_bench_f16.mlir",
"--iree-hal-target-backends=rocm",
f"--iree-hip-target={rocm_chip}",
"--iree-global-opt-propagate-transposes=true",
"--iree-codegen-llvmgpu-use-vector-distribution",
"--iree-codegen-gpu-native-math-precision=true",
"--iree-hip-waves-per-eu=2",
"--iree-opt-outer-dim-concat=true",
"--iree-llvmgpu-enable-prefetch",
"-o",
f"{benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb",
]
# iree compile command for full sdxl pipeline
ret_value, stdout = run_iree_command(exec_args)
if ret_value == 1:
return 1, stdout
exec_args = [
"iree-benchmark-module",
f"--device=hip",
"--device_allocator=caching",
f"--module={prompt_encoder_dir_compile}/model.rocm_{rocm_chip}.vmfb",
f"--parameters=model={prompt_encoder_dir}/real_weights.irpa",
f"--module={scheduled_unet_dir_compile}/model.rocm_{rocm_chip}.vmfb",
f"--parameters=model={scheduled_unet_dir}/real_weights.irpa",
f"--module={vae_decode_dir_compile}/model.rocm_{rocm_chip}.vmfb",
f"--parameters=model={vae_decode_dir}/real_weights.irpa",
f"--module={benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb",
"--function=tokens_to_image",
"--input=1x4x128x128xf16",
"--input=1xf16",
"--input=1x64xi64",
"--input=1x64xi64",
"--input=1x64xi64",
"--input=1x64xi64",
"--benchmark_repetitions=10",
"--benchmark_min_warmup_time=3.0",
]
# iree benchmark command for full sdxl pipeline
return run_iree_command(exec_args)
def run_sdxl_unet_rocm_benchmark(rocm_chip):
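    """Benchmark the scheduled unet (fp16) module's run_forward function."""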
exec_args = [
"iree-benchmark-module",
f"--device=hip",
"--device_allocator=caching",
f"--module={scheduled_unet_dir_compile}/model.rocm_{rocm_chip}.vmfb",
f"--parameters=model={scheduled_unet_dir}/real_weights.irpa",
"--function=run_forward",
"--input=1x4x128x128xf16",
"--input=2x64x2048xf16",
"--input=2x1280xf16",
"--input=2x6xf16",
"--input=1xf16",
"--input=1xi64",
"--benchmark_repetitions=10",
"--benchmark_min_warmup_time=3.0",
]
    # iree benchmark command for the sdxl scheduled unet
return run_iree_command(exec_args)
def run_sdxl_punet_int8_fp16_rocm_benchmark(rocm_chip):
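    """Benchmark the int8 punet module with fp16 attention."""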
exec_args = [
"iree-benchmark-module",
f"--device=hip",
"--device_allocator=caching",
f"--module={punet_int8_fp16_dir_compile}/punet_fp16.rocm_{rocm_chip}.vmfb",
f"--parameters=model={punet_int8_fp16_dir}/punet_weights.irpa",
"--function=main",
f"--input=1x4x128x128xf16",
f"--input=1xf16",
f"--input=2x64x2048xf16",
f"--input=2x1280xf16",
f"--input=2x6xf16",
f"--input=1xf16",
"--benchmark_repetitions=10",
"--benchmark_min_warmup_time=3.0",
]
    # iree benchmark command for sdxl punet (int8) with fp16 attention
return run_iree_command(exec_args)
def run_sdxl_punet_int8_fp8_rocm_benchmark(rocm_chip):
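    """Benchmark the int8 punet module with fp8 attention."""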
exec_args = [
"iree-benchmark-module",
f"--device=hip",
"--device_allocator=caching",
f"--module={punet_int8_fp8_dir_compile}/punet_fp8.rocm_{rocm_chip}.vmfb",
f"--parameters=model={punet_int8_fp8_dir}/punet_fp8_weights.irpa",
"--function=main",
f"--input=1x4x128x128xf16",
f"--input=1xf16",
f"--input=2x64x2048xf16",
f"--input=2x1280xf16",
f"--input=2x6xf16",
f"--input=1xf16",
"--benchmark_repetitions=10",
"--benchmark_min_warmup_time=3.0",
]
    # iree benchmark command for sdxl punet (int8) with fp8 attention
return run_iree_command(exec_args)
def run_sdxl_prompt_encoder_rocm_benchmark(rocm_chip):
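    """Benchmark the prompt encoder (CLIP) module's encode_prompts function."""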
exec_args = [
"iree-benchmark-module",
f"--device=hip",
"--device_allocator=caching",
f"--module={prompt_encoder_dir_compile}/model.rocm_{rocm_chip}.vmfb",
f"--parameters=model={prompt_encoder_dir}/real_weights.irpa",
"--function=encode_prompts",
"--input=1x64xi64",
"--input=1x64xi64",
"--input=1x64xi64",
"--input=1x64xi64",
"--benchmark_repetitions=10",
"--benchmark_min_warmup_time=3.0",
]
    # iree benchmark command for the sdxl prompt encoder (CLIP)
return run_iree_command(exec_args)
def run_sdxl_vae_decode_rocm_benchmark(rocm_chip):
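    """Benchmark the VAE decode module's main function."""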
exec_args = [
"iree-benchmark-module",
f"--device=hip",
"--device_allocator=caching",
f"--module={vae_decode_dir_compile}/model.rocm_{rocm_chip}.vmfb",
f"--parameters=model={vae_decode_dir}/real_weights.irpa",
"--function=main",
"--input=1x4x128x128xf16",
"--benchmark_repetitions=10",
"--benchmark_min_warmup_time=3.0",
]
    # iree benchmark command for sdxl vae decode
return run_iree_command(exec_args)
BenchmarkResult = namedtuple(
"BenchmarkResult", "benchmark_name time cpu_time iterations user_counters"
)
def decode_output(bench_lines):
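    """Parse the result rows of iree-benchmark-module's console output into
    BenchmarkResult tuples: name, time, cpu_time, iterations, and optional
    user counters."""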
benchmark_results = []
for line in bench_lines:
split = line.split()
if len(split) == 0:
continue
benchmark_name = split[0]
time = " ".join(split[1:3])
cpu_time = " ".join(split[3:5])
iterations = split[5]
user_counters = None
        if len(split) > 6:
            user_counters = split[6]
benchmark_results.append(
BenchmarkResult(
benchmark_name=benchmark_name,
time=time,
cpu_time=cpu_time,
iterations=iterations,
user_counters=user_counters,
)
)
return benchmark_results
def job_summary_process(ret_value, output):
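    """Extract the mean benchmark time from iree-benchmark-module output;
    logs an error and returns None if the benchmark run failed."""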
if ret_value == 1:
# Output should have already been logged earlier.
logging.getLogger().error("Running SDXL ROCm benchmark failed. Exiting.")
return
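    # Drop the leading context lines printed before the benchmark results table.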
bench_lines = output.decode().split("\n")[3:]
benchmark_results = decode_output(bench_lines)
logging.getLogger().info(benchmark_results)
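    # With --benchmark_repetitions=10, rows 0-9 are the individual repetitions
    # and row 10 is the aggregated mean row.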
benchmark_mean_time = float(benchmark_results[10].time.split()[0])
return benchmark_mean_time
def test_sdxl_rocm_benchmark(
goldentime_rocm_e2e,
goldentime_rocm_unet,
goldentime_rocm_punet_int8_fp16,
goldentime_rocm_punet_int8_fp8,
goldentime_rocm_clip,
goldentime_rocm_vae,
rocm_chip,
goldendispatch_rocm_unet,
goldendispatch_rocm_punet_int8_fp16,
goldendispatch_rocm_punet_int8_fp8,
goldendispatch_rocm_clip,
goldendispatch_rocm_vae,
goldensize_rocm_unet,
goldensize_rocm_punet_int8_fp16,
goldensize_rocm_punet_int8_fp8,
goldensize_rocm_clip,
goldensize_rocm_vae,
):
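    """Run the SDXL ROCm benchmarks (e2e pipeline, scheduled unet, punet,
    prompt encoder, and VAE decode), compare benchmark times, dispatch counts,
    and binary sizes against the golden values, and write a markdown summary
    to job_summary.md."""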
# e2e benchmark
ret_value, output = run_sdxl_rocm_benchmark(rocm_chip)
benchmark_e2e_mean_time = job_summary_process(ret_value, output)
mean_line = (
f"E2E Benchmark Time: {str(benchmark_e2e_mean_time)} ms"
f" (golden time {goldentime_rocm_e2e} ms)"
)
logging.getLogger().info(mean_line)
# unet benchmark
ret_value, output = run_sdxl_unet_rocm_benchmark(rocm_chip)
benchmark_unet_mean_time = job_summary_process(ret_value, output)
mean_line = (
f"Scheduled Unet Benchmark Time: {str(benchmark_unet_mean_time)} ms"
f" (golden time {goldentime_rocm_unet} ms)"
)
logging.getLogger().info(mean_line)
# unet compilation stats check
with open(f"{scheduled_unet_dir_compile}/compilation_info.json", "r") as file:
comp_stats = json.load(file)
unet_dispatch_count = int(
comp_stats["stream-aggregate"]["execution"]["dispatch-count"]
)
compilation_line = (
f"Scheduled Unet Dispatch Count: {unet_dispatch_count}"
f" (golden dispatch count {goldendispatch_rocm_unet})"
)
logging.getLogger().info(compilation_line)
module_path = f"{scheduled_unet_dir_compile}/model.rocm_{rocm_chip}.vmfb"
unet_binary_size = Path(module_path).stat().st_size
compilation_line = (
f"Scheduled Unet Binary Size: {unet_binary_size} bytes"
f" (golden binary size {goldensize_rocm_unet} bytes)"
)
logging.getLogger().info(compilation_line)
if rocm_chip == "gfx942":
# punet int8 f16 attention benchmark
ret_value, output = run_sdxl_punet_int8_fp16_rocm_benchmark(rocm_chip)
benchmark_punet_int8_fp16_mean_time = job_summary_process(ret_value, output)
mean_line = (
f"Punet F16 Benchmark Time: {str(benchmark_punet_int8_fp16_mean_time)} ms"
f" (golden time {goldentime_rocm_punet_int8_fp16} ms)"
)
logging.getLogger().info(mean_line)
# punet int8 f16 compilation stats check
with open(f"{punet_int8_fp16_dir_compile}/compilation_info.json", "r") as file:
comp_stats = json.load(file)
punet_int8_fp16_dispatch_count = int(
comp_stats["stream-aggregate"]["execution"]["dispatch-count"]
)
compilation_line = (
f"Punet F16 Dispatch Count: {punet_int8_fp16_dispatch_count}"
f" (golden dispatch count {goldendispatch_rocm_punet_int8_fp16})"
)
logging.getLogger().info(compilation_line)
module_path = f"{punet_int8_fp16_dir_compile}/punet_fp16.rocm_{rocm_chip}.vmfb"
punet_int8_fp16_binary_size = Path(module_path).stat().st_size
compilation_line = (
f"Punet F16 Binary Size: {punet_int8_fp16_binary_size} bytes"
f" (golden binary size {goldensize_rocm_punet_int8_fp16} bytes)"
)
logging.getLogger().info(compilation_line)
# punet int8 f8 attention benchmark
ret_value, output = run_sdxl_punet_int8_fp8_rocm_benchmark(rocm_chip)
benchmark_punet_int8_fp8_mean_time = job_summary_process(ret_value, output)
mean_line = (
f"Punet F8 Benchmark Time: {str(benchmark_punet_int8_fp8_mean_time)} ms"
f" (golden time {goldentime_rocm_punet_int8_fp8} ms)"
)
logging.getLogger().info(mean_line)
# punet int8 f8 compilation stats check
with open(f"{punet_int8_fp8_dir_compile}/compilation_info.json", "r") as file:
comp_stats = json.load(file)
punet_int8_fp8_dispatch_count = int(
comp_stats["stream-aggregate"]["execution"]["dispatch-count"]
)
compilation_line = (
f"Punet F8 Dispatch Count: {punet_int8_fp8_dispatch_count}"
f" (golden dispatch count {goldendispatch_rocm_punet_int8_fp8})"
)
logging.getLogger().info(compilation_line)
module_path = f"{punet_int8_fp8_dir_compile}/punet_fp8.rocm_{rocm_chip}.vmfb"
punet_int8_fp8_binary_size = Path(module_path).stat().st_size
compilation_line = (
f"Punet F8 Binary Size: {punet_int8_fp8_binary_size} bytes"
f" (golden binary size {goldensize_rocm_punet_int8_fp8} bytes)"
)
logging.getLogger().info(compilation_line)
# prompt encoder benchmark
ret_value, output = run_sdxl_prompt_encoder_rocm_benchmark(rocm_chip)
benchmark_clip_mean_time = job_summary_process(ret_value, output)
mean_line = (
f"Prompt Encoder Benchmark Time: {str(benchmark_clip_mean_time)} ms"
f" (golden time {goldentime_rocm_clip} ms)"
)
logging.getLogger().info(mean_line)
# prompt encoder compilation stats check
with open(f"{prompt_encoder_dir_compile}/compilation_info.json", "r") as file:
comp_stats = json.load(file)
clip_dispatch_count = int(
comp_stats["stream-aggregate"]["execution"]["dispatch-count"]
)
compilation_line = (
f"Prompt Encoder Dispatch Count: {clip_dispatch_count}"
f" (golden dispatch count {goldendispatch_rocm_clip})"
)
logging.getLogger().info(compilation_line)
module_path = f"{prompt_encoder_dir_compile}/model.rocm_{rocm_chip}.vmfb"
clip_binary_size = Path(module_path).stat().st_size
compilation_line = (
f"Prompt Encoder Binary Size: {clip_binary_size} bytes"
f" (golden binary size {goldensize_rocm_clip} bytes)"
)
logging.getLogger().info(compilation_line)
# vae decode benchmark
ret_value, output = run_sdxl_vae_decode_rocm_benchmark(rocm_chip)
benchmark_vae_mean_time = job_summary_process(ret_value, output)
mean_line = (
f"VAE Decode Benchmark Time: {str(benchmark_vae_mean_time)} ms"
f" (golden time {goldentime_rocm_vae} ms)"
)
logging.getLogger().info(mean_line)
# vae decode compilation stats check
with open(f"{vae_decode_dir_compile}/compilation_info.json", "r") as file:
comp_stats = json.load(file)
vae_dispatch_count = int(
comp_stats["stream-aggregate"]["execution"]["dispatch-count"]
)
compilation_line = (
f"VAE Decode Dispatch Count: {vae_dispatch_count}"
f" (golden dispatch count {goldendispatch_rocm_vae})"
)
logging.getLogger().info(compilation_line)
module_path = f"{vae_decode_dir_compile}/model.rocm_{rocm_chip}.vmfb"
vae_binary_size = Path(module_path).stat().st_size
compilation_line = (
f"VAE Decode Binary Size: {vae_binary_size} bytes"
f" (golden binary size {goldensize_rocm_vae} bytes)"
)
logging.getLogger().info(compilation_line)
# Create mean time table's header and rows
mean_time_header = ["Benchmark", "Current time (ms)", "Expected/golden time (ms)"]
mean_time_rows = [
["E2E†", f"{benchmark_e2e_mean_time}", f"{goldentime_rocm_e2e}"],
["Scheduled Unet", f"{benchmark_unet_mean_time}", f"{goldentime_rocm_unet}"],
["Prompt Encoder", f"{benchmark_clip_mean_time}", f"{goldentime_rocm_clip}"],
["VAE Decode", f"{benchmark_vae_mean_time}", f"{goldentime_rocm_vae}"],
]
if rocm_chip == "gfx942":
mean_time_rows.append(
[
"Punet F16",
f"{benchmark_punet_int8_fp16_mean_time}",
f"{goldentime_rocm_punet_int8_fp16}",
]
)
mean_time_rows.append(
[
"Punet F8",
f"{benchmark_punet_int8_fp8_mean_time}",
f"{goldentime_rocm_punet_int8_fp8}",
]
)
# Create dispatch count table's header and rows
dispatch_count_header = [
"Benchmark",
"Current dispatch count",
"Expected/golden dispatch count",
]
dispatch_count_rows = [
["Scheduled Unet", f"{unet_dispatch_count}", f"{goldendispatch_rocm_unet}"],
["Prompt Encoder", f"{clip_dispatch_count}", f"{goldendispatch_rocm_clip}"],
["VAE Decode", f"{vae_dispatch_count}", f"{goldendispatch_rocm_vae}"],
]
if rocm_chip == "gfx942":
dispatch_count_rows.append(
[
"Punet F16",
f"{punet_int8_fp16_dispatch_count}",
f"{goldendispatch_rocm_punet_int8_fp16}",
]
)
dispatch_count_rows.append(
[
"Punet F8",
f"{punet_int8_fp8_dispatch_count}",
f"{goldendispatch_rocm_punet_int8_fp8}",
]
)
# Create binary size table's header and rows
binary_size_header = [
"Benchmark",
"Current binary size (bytes)",
"Expected/golden binary size (bytes)",
]
binary_size_rows = [
["Scheduled Unet", f"{unet_binary_size}", f"{goldensize_rocm_unet}"],
["Prompt Encoder", f"{clip_binary_size}", f"{goldensize_rocm_clip}"],
["VAE Decode", f"{vae_binary_size}", f"{goldensize_rocm_vae}"],
]
if rocm_chip == "gfx942":
binary_size_rows.append(
[
"Punet F16",
f"{punet_int8_fp16_binary_size}",
f"{goldensize_rocm_punet_int8_fp16}",
]
)
binary_size_rows.append(
[
"Punet F8",
f"{punet_int8_fp8_binary_size}",
f"{goldensize_rocm_punet_int8_fp8}",
]
)
# Create mean time table using tabulate
mean_time_full = [mean_time_header] + mean_time_rows
mean_time_table = tabulate.tabulate(
mean_time_full, headers="firstrow", tablefmt="pipe"
)
# Create dispatch count table using tabulate
dispatch_count_full = [dispatch_count_header] + dispatch_count_rows
dispatch_count_table = tabulate.tabulate(
dispatch_count_full, headers="firstrow", tablefmt="pipe"
)
# Create binary size of compiled artifacts table using tabulate
binary_size_full = [binary_size_header] + binary_size_rows
binary_size_table = tabulate.tabulate(
binary_size_full, headers="firstrow", tablefmt="pipe"
)
# Write markdown tables to job summary file
with open("job_summary.md", "w") as job_summary:
print("SDXL Benchmark Summary:\n", file=job_summary)
print(mean_time_table, file=job_summary)
print("\n† E2E = Encode + Scheduled Unet * 3 + Decode\n", file=job_summary)
print(dispatch_count_table, file=job_summary)
print("\n", file=job_summary)
print(binary_size_table, file=job_summary)
    # Check that benchmark times do not regress and that compilation statistics
    # (dispatch counts, binary sizes) do not exceed their golden values.
check.less_equal(
benchmark_e2e_mean_time,
goldentime_rocm_e2e,
"SDXL e2e benchmark time should not regress",
)
check.less_equal(
benchmark_unet_mean_time,
goldentime_rocm_unet,
"SDXL unet benchmark time should not regress",
)
check.less_equal(
unet_dispatch_count,
goldendispatch_rocm_unet,
"SDXL scheduled unet dispatch count should not regress",
)
check.less_equal(
unet_binary_size,
goldensize_rocm_unet,
"SDXL scheduled unet binary size should not get bigger",
)
if rocm_chip == "gfx942":
check.less_equal(
benchmark_punet_int8_fp16_mean_time,
goldentime_rocm_punet_int8_fp16,
"SDXL punet f16 benchmark time should not regress",
)
check.less_equal(
punet_int8_fp16_dispatch_count,
goldendispatch_rocm_punet_int8_fp16,
"SDXL punet f16 dispatch count should not regress",
)
check.less_equal(
punet_int8_fp16_binary_size,
goldensize_rocm_punet_int8_fp16,
"SDXL punet f16 binary size should not get bigger",
)
check.less_equal(
benchmark_punet_int8_fp8_mean_time,
goldentime_rocm_punet_int8_fp8,
"SDXL punet f8 benchmark time should not regress",
)
check.less_equal(
punet_int8_fp8_dispatch_count,
goldendispatch_rocm_punet_int8_fp8,
"SDXL punet f8 dispatch count should not regress",
)
check.less_equal(
punet_int8_fp8_binary_size,
goldensize_rocm_punet_int8_fp8,
"SDXL punet f8 binary size should not get bigger",
)
check.less_equal(
benchmark_clip_mean_time,
goldentime_rocm_clip,
"SDXL prompt encoder benchmark time should not regress",
)
check.less_equal(
clip_dispatch_count,
goldendispatch_rocm_clip,
"SDXL prompt encoder dispatch count should not regress",
)
check.less_equal(
clip_binary_size,
goldensize_rocm_clip,
"SDXL prompt encoder binary size should not get bigger",
)
check.less_equal(
benchmark_vae_mean_time,
goldentime_rocm_vae,
"SDXL vae decode benchmark time should not regress",
)
check.less_equal(
vae_dispatch_count,
goldendispatch_rocm_vae,
"SDXL vae decode dispatch count should not regress",
)
check.less_equal(
vae_binary_size,
goldensize_rocm_vae,
"SDXL vae decode binary size should not get bigger",
)