# Copyright 2023 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

name: PkgCI Regression Test
on:
  workflow_call:
    inputs:
      artifact_run_id:
        type: string
        default: ""
  workflow_dispatch:
    inputs:
      artifact_run_id:
        type: string
        default: ""
jobs:
  test_onnx:
    name: "test_onnx :: ${{ matrix.name }}"
    runs-on: ${{ matrix.runs-on }}
    strategy:
      fail-fast: false
      matrix:
        include:
          # CPU
          - name: cpu_llvm_sync
            config-file: onnx_cpu_llvm_sync.json
            numprocesses: auto
            runs-on: ubuntu-20.04

          # AMD GPU
          - name: amdgpu_rocm_rdna3
            numprocesses: 1
            config-file: onnx_gpu_rocm_rdna3.json
            runs-on: nodai-amdgpu-w7900-x86-64
          - name: amdgpu_vulkan
            numprocesses: 4
            config-file: onnx_gpu_vulkan.json
            runs-on: nodai-amdgpu-w7900-x86-64

          # NVIDIA GPU
          - name: nvidiagpu_cuda
            config-file: onnx_gpu_cuda.json
            numprocesses: 4
            runs-on:
              - self-hosted # must come first
              - runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
              - environment=prod
              - gpu # TODO(scotttodd): qualify further with vendor/model
              - os-family=Linux
          - name: nvidiagpu_vulkan
            config-file: onnx_gpu_vulkan.json
            numprocesses: 4
            runs-on:
              - self-hosted # must come first
              - runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
              - environment=prod
              - gpu # TODO(scotttodd): qualify further with vendor/model
              - os-family=Linux
    env:
      PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
      IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
      CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.config-file }}
      NUMPROCESSES: ${{ matrix.numprocesses }}
      LOG_FILE_PATH: /tmp/iree_tests_onnx_${{ matrix.name }}_logs.json
      VENV_DIR: ${{ github.workspace }}/venv
    steps:
      - name: Checking out IREE repository
        uses: actions/checkout@v4.1.7
        with:
          submodules: false
      - uses: actions/setup-python@v5.1.0
        with:
          # Must match the subset of versions built in pkgci_build_packages.
          python-version: "3.11"
      - uses: actions/download-artifact@v4.1.7
        with:
          name: linux_x86_64_release_packages
          path: ${{ env.PACKAGE_DOWNLOAD_DIR }}
      - name: Setup venv
        run: |
          ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \
            --artifact-path=${PACKAGE_DOWNLOAD_DIR} \
            --fetch-gh-workflow=${{ inputs.artifact_run_id }}
      - name: Check out external TestSuite repository
        uses: actions/checkout@v4.1.7
        with:
          repository: nod-ai/SHARK-TestSuite
          ref: 1da89a12a8b00dc77506d702f61300803b6240b7
          path: SHARK-TestSuite
          submodules: false
          lfs: false
      - name: Install external TestSuite Python requirements
        run: |
          source ${VENV_DIR}/bin/activate
          python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt
          pip install --no-compile --pre --upgrade -e SHARK-TestSuite/common_tools
      - name: Run external tests - ONNX test suite
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/onnx/ \
            -rpfE \
            --numprocesses ${NUMPROCESSES} \
            --timeout=30 \
            --durations=20 \
            --no-skip-tests-missing-files \
            --config-files=${CONFIG_FILE_PATH} \
            --report-log=${LOG_FILE_PATH}
      - name: "Updating config file with latest XFAIL lists"
        if: failure()
        run: |
          source ${VENV_DIR}/bin/activate
          python SHARK-TestSuite/iree_tests/update_config_xfails.py \
            --log-file=${LOG_FILE_PATH} \
            --config-file=${CONFIG_FILE_PATH}
          cat ${CONFIG_FILE_PATH}
      - name: "Uploading new config file"
        if: failure()
        uses: actions/upload-artifact@v4.3.3
        with:
          name: ${{ matrix.config-file }}
          path: ${{ env.CONFIG_FILE_PATH }}
  test_models:
    name: "test_models :: ${{ matrix.name }}"
    runs-on: ${{ matrix.runs-on }}
    strategy:
      fail-fast: false
      # Note: these jobs should use persistent runners with local caches.
      # Downloading test files (50GB+) without a cache can take 20+ minutes.
      matrix:
        include:
          # CPU
          - name: cpu_llvm_task
            models-config-file: models_cpu_llvm_task.json
            iree-tests-cache: ~/iree_tests_cache
            runs-on: nodai-amdgpu-w7900-x86-64

          # AMD GPU
          - name: amdgpu_rocm_mi250_gfx90a
            models-config-file: models_gpu_rocm_gfx90a.json
            iree-tests-cache: /groups/aig_sharks/iree-tests-cache
            runs-on: nodai-amdgpu-mi250-x86-64
          - name: amdgpu_rocm_mi300_gfx942
            models-config-file: models_gpu_rocm_gfx942.json
            iree-tests-cache: ~/iree_tests_cache
            runs-on: nodai-amdgpu-mi300-x86-64
          - name: amdgpu_vulkan
            models-config-file: models_gpu_vulkan.json
            iree-tests-cache: ~/iree_tests_cache
            runs-on: nodai-amdgpu-w7900-x86-64

          # NVIDIA GPU
          # None at the moment. Could maybe use the persistent a100 runners:
          #   - self-hosted # must come first
          #   - runner-group=${{ needs.setup.outputs.runner-group }}
          #   - environment=${{ needs.setup.outputs.runner-env }}
          #   - a100
          #   - os-family=Linux
          # (note: would need to plumb the presubmit/postsubmit runner-group through to here too)
    env:
      PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
      IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
      IREE_TEST_FILES: ${{ matrix.iree-tests-cache }}
      IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
      MODELS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-config-file }}
      VENV_DIR: ${{ github.workspace }}/venv
      LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9
    steps:
      # TODO(saienduri): Find an alternative to this temporary step, which changes
      # permissions on the GitHub Actions work directory so it can be cleaned
      # after every PR.
      - name: Pre Checkout MI300 Step
        if: contains(matrix.name, 'gfx942')
        run: |
          sudo chmod -R 777 ~/actions-runner/_work
      - name: Checking out IREE repository
        uses: actions/checkout@v4.1.7
        with:
          submodules: false
      - uses: actions/setup-python@v5.1.0
        with:
          # Must match the subset of versions built in pkgci_build_packages.
          python-version: "3.11"
      - uses: actions/download-artifact@v4.1.7
        with:
          name: linux_x86_64_release_packages
          path: ${{ env.PACKAGE_DOWNLOAD_DIR }}
      - name: Setup venv
        run: |
          ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \
            --artifact-path=${PACKAGE_DOWNLOAD_DIR} \
            --fetch-gh-workflow=${{ inputs.artifact_run_id }}

      # Out of tree tests
      - name: Check out external TestSuite repository
        uses: actions/checkout@v4.1.7
        with:
          repository: nod-ai/SHARK-TestSuite
          ref: 1da89a12a8b00dc77506d702f61300803b6240b7
          path: SHARK-TestSuite
          submodules: false
          lfs: true
      - name: Install external TestSuite Python requirements
        run: |
          source ${VENV_DIR}/bin/activate
          python3 -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt
          pip install --no-compile --pre --upgrade -e SHARK-TestSuite/common_tools
      - name: Download remote files for real weight model tests
        run: |
          source ${VENV_DIR}/bin/activate
          python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir iree_tests/pytorch/models
          python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir iree_tests/sharktank
      - name: Run external tests - models with real weights
        if: "matrix.models-config-file != '' && !cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest \
            SHARK-TestSuite/iree_tests/pytorch/models \
            SHARK-TestSuite/iree_tests/sharktank \
            -rA \
            -k real_weights \
            --no-skip-tests-missing-files \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0 \
            --config-files=${MODELS_CONFIG_FILE_PATH}
  test_regression_suite:
    name: "test_regression_suite :: ${{ matrix.name }}"
    runs-on: ${{ matrix.runs-on }}
    strategy:
      fail-fast: false
      # Note: these jobs should use persistent runners with local caches.
      # Downloading test files (50GB+) without a cache can take 20+ minutes.
      matrix:
        include:
          # CPU
          - name: cpu_llvm_task
            models-config-file: models_cpu_llvm_task.json
            backend: cpu
            iree-tests-cache: ~/iree_tests_cache
            runs-on: nodai-amdgpu-w7900-x86-64

          # AMD GPU
          - name: amdgpu_rocm_mi250_gfx90a
            rocm-chip: gfx90a
            backend: rocm
            iree-tests-cache: /groups/aig_sharks/iree-tests-cache
            runs-on: nodai-amdgpu-mi250-x86-64
          - name: amdgpu_rocm_mi300_gfx942
            rocm-chip: gfx942
            backend: rocm
            iree-tests-cache: ~/iree_tests_cache
            runs-on: nodai-amdgpu-mi300-x86-64
    env:
      PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
      IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
      IREE_TEST_FILES: ${{ matrix.iree-tests-cache }}
      IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
      VENV_DIR: ${{ github.workspace }}/venv
      LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9
    steps:
      # TODO(saienduri): Find an alternative to this temporary step, which changes
      # permissions on the GitHub Actions work directory so it can be cleaned
      # after every PR.
      - name: Pre Checkout MI300 Step
        if: contains(matrix.name, 'gfx942')
        run: |
          sudo chmod -R 777 ~/actions-runner/_work
      - name: Checking out IREE repository
        uses: actions/checkout@v4.1.7
        with:
          submodules: false
      - uses: actions/setup-python@v5.1.0
        with:
          # Must match the subset of versions built in pkgci_build_packages.
          python-version: "3.11"
      - uses: actions/download-artifact@v4.1.7
        with:
          name: linux_x86_64_release_packages
          path: ${{ env.PACKAGE_DOWNLOAD_DIR }}
      - name: Setup venv
        run: |
          ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \
            --artifact-path=${PACKAGE_DOWNLOAD_DIR} \
            --fetch-gh-workflow=${{ inputs.artifact_run_id }}

      # TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm
      # # In-tree tests
      # - name: Run experimental/regression_suite tests
      #   run: |
      #     source ${VENV_DIR}/bin/activate
      #     pytest \
      #       -rA -s -m "plat_host_cpu and presubmit" \
      #       experimental/regression_suite

      - name: "Running SDXL special model tests"
        if: "!cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest ./experimental/regression_suite/shark-test-suite-models/sdxl \
            -k ${{ matrix.backend }} \
            -rpfE \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0
        env:
          ROCM_CHIP: ${{ matrix.rocm-chip }}
      - name: "Running SD3 special model tests"
        if: "!cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest ./experimental/regression_suite/shark-test-suite-models/sd3 \
            -k ${{ matrix.backend }} \
            -rpfE \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0
        env:
          ROCM_CHIP: ${{ matrix.rocm-chip }}

      # Note: mi250 benchmark times are more lenient than mi300 (allowing about
      # 10% deviation from observed averages), since the mi250 runners we use
      # are more unstable and we care most about peak performance on mi300.
      - name: "Running SDXL rocm pipeline benchmark (mi250)"
        if: contains(matrix.name, 'rocm_mi250_gfx90a')
        run: |
          source ${VENV_DIR}/bin/activate
          pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \
            --goldentime-rocm-e2e-ms 1450.0 \
            --goldentime-rocm-unet-ms 370.0 \
            --goldentime-rocm-clip-ms 18.5 \
            --goldentime-rocm-vae-ms 315.0 \
            --goldendispatch-rocm-unet 1714 \
            --goldendispatch-rocm-clip 1569 \
            --goldendispatch-rocm-vae 248 \
            --goldensize-rocm-unet-bytes 2280000 \
            --goldensize-rocm-clip-bytes 860000 \
            --goldensize-rocm-vae-bytes 840000 \
            --gpu-number 6 \
            --rocm-chip gfx90a \
            --log-cli-level=info \
            --retries 7
          echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
          rm job_summary.md
      - name: "Running SDXL rocm pipeline benchmark (mi300)"
        if: contains(matrix.name, 'rocm_mi300_gfx942')
        run: |
          source ${VENV_DIR}/bin/activate
          pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \
            --goldentime-rocm-e2e-ms 325.0 \
            --goldentime-rocm-unet-ms 77.0 \
            --goldentime-rocm-clip-ms 15.5 \
            --goldentime-rocm-vae-ms 74.0 \
            --goldendispatch-rocm-unet 1714 \
            --goldendispatch-rocm-clip 1569 \
            --goldendispatch-rocm-vae 248 \
            --goldensize-rocm-unet-bytes 2270000 \
            --goldensize-rocm-clip-bytes 860000 \
            --goldensize-rocm-vae-bytes 840000 \
            --gpu-number 0 \
            --rocm-chip gfx942 \
            --log-cli-level=info \
            --retries 7
          echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY