# blob: 2d6d2c90b4f74ae7e56da376df3dd8c169d5df56 [file] [log] [blame]
# Copyright 2023 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
name: PkgCI Regression Test

# Triggered either as a reusable workflow (workflow_call) or manually
# (workflow_dispatch). Both triggers declare the same single input.
on:
  workflow_call:
    inputs:
      # Forwarded to `setup_venv.py --fetch-gh-workflow=` in the "Setup venv"
      # steps of this workflow. Defaults to the empty string.
      artifact_run_id:
        type: string
        default: ""
  workflow_dispatch:
    inputs:
      # Same meaning and default as the workflow_call input of the same name.
      artifact_run_id:
        type: string
        default: ""
jobs:
  # Runs the ONNX tests from the external SHARK-TestSuite repository, once per
  # matrix entry. On failure, an updated config file is produced and uploaded.
  test_onnx:
    name: "test_onnx :: ${{ matrix.name }}"
    runs-on: ${{ matrix.runs-on }}
    strategy:
      fail-fast: false
      matrix:
        include:
          # CPU
          - name: cpu_llvm_sync
            config-file: onnx_cpu_llvm_sync.json
            numprocesses: auto
            runs-on: ubuntu-20.04

          # AMD GPU
          - name: amdgpu_rocm_rdna3
            numprocesses: 1
            config-file: onnx_gpu_rocm_rdna3.json
            runs-on: nodai-amdgpu-w7900-x86-64
          - name: amdgpu_vulkan
            numprocesses: 4
            config-file: onnx_gpu_vulkan.json
            runs-on: nodai-amdgpu-w7900-x86-64

          # NVIDIA GPU
          - name: nvidiagpu_cuda
            config-file: onnx_gpu_cuda.json
            numprocesses: 4
            runs-on:
              - self-hosted  # must come first
              - runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
              - environment=prod
              - gpu  # TODO(scotttodd): qualify further with vendor/model
              - os-family=Linux
          - name: nvidiagpu_vulkan
            config-file: onnx_gpu_vulkan.json
            numprocesses: 4
            runs-on:
              - self-hosted  # must come first
              - runner-group=${{ github.event_name == 'pull_request' && 'presubmit' || 'postsubmit' }}
              - environment=prod
              - gpu  # TODO(scotttodd): qualify further with vendor/model
              - os-family=Linux
    env:
      PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
      IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
      CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.config-file }}
      NUMPROCESSES: ${{ matrix.numprocesses }}
      LOG_FILE_PATH: /tmp/iree_tests_onnx_${{ matrix.name }}_logs.json
      VENV_DIR: ${{ github.workspace }}/venv
    steps:
      - name: Checking out IREE repository
        uses: actions/checkout@v4.1.7
        with:
          submodules: false
      - uses: actions/setup-python@v5.1.0
        with:
          # Must match the subset of versions built in pkgci_build_packages.
          python-version: "3.11"
      - uses: actions/download-artifact@v4.1.7
        with:
          name: linux_x86_64_release_packages
          path: ${{ env.PACKAGE_DOWNLOAD_DIR }}
      - name: Setup venv
        env:
          # The workflow input is passed through the environment instead of
          # being expanded into the script text, so its value can never be
          # interpreted as shell syntax.
          ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id }}
        run: |
          ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \
            --artifact-path=${PACKAGE_DOWNLOAD_DIR} \
            --fetch-gh-workflow="${ARTIFACT_RUN_ID}"

      - name: Check out external TestSuite repository
        uses: actions/checkout@v4.1.7
        with:
          repository: nod-ai/SHARK-TestSuite
          ref: 3603a453b3777fac9af4506a3dc0b3d87587fd47
          path: SHARK-TestSuite
          submodules: false
          lfs: false
      - name: Install external TestSuite Python requirements
        run: |
          source ${VENV_DIR}/bin/activate
          python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt
      - name: Run external tests - ONNX test suite
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/onnx/ \
            -rpfE \
            --numprocesses ${NUMPROCESSES} \
            --timeout=30 \
            --durations=20 \
            --no-skip-tests-missing-files \
            --config-files=${CONFIG_FILE_PATH} \
            --report-log=${LOG_FILE_PATH}

      # The two steps below only run after an earlier step in this job failed.
      # The first feeds the pytest report log and the config file to
      # update_config_xfails.py and prints the config file; the second uploads
      # that config file as an artifact.
      - name: "Updating config file with latest XFAIL lists"
        if: failure()
        run: |
          source ${VENV_DIR}/bin/activate
          python SHARK-TestSuite/iree_tests/update_config_xfails.py \
            --log-file=${LOG_FILE_PATH} \
            --config-file=${CONFIG_FILE_PATH}
          cat ${CONFIG_FILE_PATH}
      - name: "Uploading new config file"
        if: failure()
        uses: actions/upload-artifact@v4.3.3
        with:
          # upload-artifact v4 refuses a second upload under a name that
          # already exists in the run, and two matrix entries share
          # onnx_gpu_vulkan.json. Prefixing with the matrix name keeps every
          # artifact name unique.
          name: ${{ matrix.name }}_${{ matrix.config-file }}
          path: ${{ env.CONFIG_FILE_PATH }}

  # Runs the real-weight model tests and the SDXL tests/benchmarks from the
  # external SHARK-TestSuite repository, once per matrix entry.
  test_models:
    name: "test_models :: ${{ matrix.name }}"
    runs-on: ${{ matrix.runs-on }}
    strategy:
      fail-fast: false

      # Note: these jobs should use persistent runners with local caches.
      # Downloading test files (50GB+) without a cache can take 20+ minutes.
      matrix:
        include:
          # CPU
          - name: cpu_llvm_task
            models-config-file: models_cpu_llvm_task.json
            sdxl-unet-config-file: sdxl_scheduled_unet_cpu_llvm_task.json
            sdxl-vae-config-file: sdxl_vae_decode_cpu_llvm_task.json
            sdxl-clip-config-file: sdxl_prompt_encoder_cpu_llvm_task.json
            runs-on: nodai-amdgpu-w7900-x86-64

          # AMD GPU
          - name: amdgpu_rocm_mi250_gfx90a
            models-config-file: models_gpu_rocm_gfx90a.json
            models-extra-flags-config-file: models_gpu_rocm_gfx90a_additional_flags.json
            sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx90a.json
            sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx90a.json
            sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx90a.json
            runs-on: nodai-amdgpu-mi250-x86-64
          - name: amdgpu_rocm_mi300_gfx942
            models-config-file: models_gpu_rocm_gfx942.json
            models-extra-flags-config-file: models_gpu_rocm_gfx942_additional_flags.json
            sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx942.json
            sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx942.json
            sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx942.json
            runs-on: nodai-amdgpu-mi300-x86-64
          - name: amdgpu_vulkan
            models-config-file: models_gpu_vulkan.json
            runs-on: nodai-amdgpu-w7900-x86-64

          # NVIDIA GPU
          # None at the moment. Could maybe use the persistent a100 runners:
          #   - self-hosted # must come first
          #   - runner-group=${{ needs.setup.outputs.runner-group }}
          #   - environment=${{ needs.setup.outputs.runner-env }}
          #   - a100
          #   - os-family=Linux
          # (note: would need to plumb the presubmit/postsubmit runner-group through to here too)
    env:
      PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
      IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
      # NOTE(review): this value is set literally, so the leading `~` is not
      # expanded by the workflow; presumably the test suite expands it itself.
      # TODO confirm.
      IREE_TEST_FILES: ~/iree_tests_cache
      IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
      # Matrix entries that omit one of the *-config-file keys leave the
      # matching path below ending at the directory; the step that would use
      # it is skipped by its `if:` condition.
      MODELS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-config-file }}
      MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-extra-flags-config-file }}
      SDXL_UNET_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-unet-config-file }}
      SDXL_CLIP_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-clip-config-file }}
      SDXL_VAE_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-vae-config-file }}
      VENV_DIR: ${{ github.workspace }}/venv
      # NOTE(review): hard-coded path inside one user's home directory;
      # presumably only present on the self-hosted runners. TODO confirm.
      LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9
    steps:
      # TODO(saienduri): Find alternative to this temporary step that manipulates permission of github actions
      # directory to be able to clean after every PR
      - name: Pre Checkout MI300 Step
        if: contains(matrix.name, 'gfx942')
        run: |
          sudo chmod -R 777 ~/actions-runner/_work
      - name: Checking out IREE repository
        uses: actions/checkout@v4.1.7
        with:
          submodules: false
      - uses: actions/setup-python@v5.1.0
        with:
          # Must match the subset of versions built in pkgci_build_packages.
          python-version: "3.11"
      - uses: actions/download-artifact@v4.1.7
        with:
          name: linux_x86_64_release_packages
          path: ${{ env.PACKAGE_DOWNLOAD_DIR }}
      - name: Setup venv
        env:
          # The workflow input is passed through the environment instead of
          # being expanded into the script text, so its value can never be
          # interpreted as shell syntax.
          ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id }}
        run: |
          ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \
            --artifact-path=${PACKAGE_DOWNLOAD_DIR} \
            --fetch-gh-workflow="${ARTIFACT_RUN_ID}"

      # TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm
      # # In-tree tests
      # - name: Run experimental/regression_suite tests
      #   run: |
      #     source ${VENV_DIR}/bin/activate
      #     pytest \
      #       -rA -s -m "plat_host_cpu and presubmit" \
      #       experimental/regression_suite

      # Out of tree tests
      - name: Check out external TestSuite repository
        uses: actions/checkout@v4.1.7
        with:
          repository: nod-ai/SHARK-TestSuite
          ref: 3603a453b3777fac9af4506a3dc0b3d87587fd47
          path: SHARK-TestSuite
          submodules: false
          lfs: true
      - name: Install external TestSuite Python requirements
        run: |
          source ${VENV_DIR}/bin/activate
          python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt
      - name: Download remote files for real weight model tests
        run: |
          source ${VENV_DIR}/bin/activate
          python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir pytorch/models
          python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir sharktank

      # Each test step below runs only when its matrix entry names the
      # corresponding config file, and runs even if an earlier test step
      # failed (it is skipped only when the run was cancelled).
      - name: Run external tests - models with real weights
        if: "matrix.models-config-file != '' && !cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest \
            SHARK-TestSuite/iree_tests/pytorch/models \
            SHARK-TestSuite/iree_tests/sharktank \
            -rpfE \
            -k real_weights \
            --no-skip-tests-missing-files \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0 \
            --config-files=${MODELS_CONFIG_FILE_PATH}
      - name: Run external tests - models with real weights and additional flags
        if: "matrix.models-extra-flags-config-file != '' && !cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/pytorch/models \
            -rpfE \
            -k real_weights \
            --no-skip-tests-missing-files \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0 \
            --config-files=${MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH}
      - name: "Run external tests - SDXL scheduled unet"
        if: "matrix.sdxl-unet-config-file != '' && !cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank \
            -rpfE \
            -k real_weights \
            --no-skip-tests-missing-files \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0 \
            --config-files=${SDXL_UNET_CONFIG_FILE_PATH}
      - name: "Run external tests - SDXL prompt encoder"
        if: "matrix.sdxl-clip-config-file != '' && !cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-prompt-encoder-tank \
            -rpfE \
            -k real_weights \
            --no-skip-tests-missing-files \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0 \
            --config-files=${SDXL_CLIP_CONFIG_FILE_PATH}
      - name: "Run external tests - SDXL vae decode"
        if: "matrix.sdxl-vae-config-file != '' && !cancelled()"
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-vae-decode-tank \
            -rpfE \
            -k real_weights \
            --no-skip-tests-missing-files \
            --capture=no \
            --log-cli-level=info \
            --timeout=1200 \
            --durations=0 \
            --config-files=${SDXL_VAE_CONFIG_FILE_PATH}

      # The benchmark steps run only on the matching ROCm matrix entry and
      # append the job_summary.md file found in the working directory to the
      # GitHub step summary.
      - name: "Running SDXL rocm pipeline benchmark (mi250)"
        if: contains(matrix.name, 'rocm_mi250_gfx90a')
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \
            --goldentime-rocm-e2e-ms 1336.0 \
            --goldentime-rocm-unet-ms 340.0 \
            --goldentime-rocm-clip-ms 17.5 \
            --goldentime-rocm-vae-ms 300.0 \
            --goldendispatch-rocm-unet 1714 \
            --goldendispatch-rocm-clip 1569 \
            --goldendispatch-rocm-vae 248 \
            --goldensize-rocm-unet-bytes 2073609 \
            --goldensize-rocm-clip-bytes 783720 \
            --goldensize-rocm-vae-bytes 764909 \
            --gpu-number 6 \
            --rocm-chip gfx90a \
            --log-cli-level=info \
            --retries 7
          echo "$(<job_summary.md )" >> "$GITHUB_STEP_SUMMARY"
          rm job_summary.md
      - name: "Running SDXL rocm pipeline benchmark (mi300)"
        if: contains(matrix.name, 'rocm_mi300_gfx942')
        run: |
          source ${VENV_DIR}/bin/activate
          pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \
            --goldentime-rocm-e2e-ms 320 \
            --goldentime-rocm-unet-ms 77 \
            --goldentime-rocm-clip-ms 15 \
            --goldentime-rocm-vae-ms 74 \
            --goldendispatch-rocm-unet 1714 \
            --goldendispatch-rocm-clip 1569 \
            --goldendispatch-rocm-vae 248 \
            --goldensize-rocm-unet-bytes 2054938 \
            --goldensize-rocm-clip-bytes 780328 \
            --goldensize-rocm-vae-bytes 758509 \
            --gpu-number 0 \
            --rocm-chip gfx942 \
            --log-cli-level=info \
            --retries 7
          echo "$(<job_summary.md )" >> "$GITHUB_STEP_SUMMARY"