Enable MI300 CI testing. (#17842)

This commit enables MI300 (gfx942) GPU and model testing in CI: a new test_amd_mi300 job runs the HIP test suite, and the pkgci regression tests gain gfx942 model and SDXL benchmark configurations.

ci-exactly: build_all, test_amd_mi300, build_packages, regression_test

---------

Signed-off-by: saienduri <saimanas.enduri@amd.com>
Co-authored-by: Scott Todd <scott.todd0@gmail.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8c69c43..74b3d90 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -430,6 +430,50 @@
         run: |
           ./build_tools/cmake/ctest_all.sh ${BUILD_DIR}
 
+  test_amd_mi300:
+    needs: [setup, build_all]
+    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_amd_mi300')
+    env:
+      BUILD_DIR: build-tests
+      INSTALL_DIR: ${{ needs.build_all.outputs.install-dir }}
+      INSTALL_DIR_ARCHIVE: ${{ needs.build_all.outputs.install-dir-archive }}
+      INSTALL_DIR_GCS_URL: ${{ needs.build_all.outputs.install-dir-gcs-url }}
+      IREE_CPU_DISABLE: 1
+      IREE_VULKAN_DISABLE: 1
+      IREE_CUDA_DISABLE: 1
+      IREE_HIP_DISABLE: 0
+      IREE_HIP_TEST_TARGET_CHIP: "gfx942"
+      LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9
+    runs-on: nodai-amdgpu-mi300-x86-64
+    steps:
+      - name: Pre Checkout MI300 Step
+        if: contains(matrix.name, 'gfx942')
+        run: |
+          sudo chmod -R 777 ~/actions-runner/_work
+      - name: "Checking out repository"
+        uses: actions/checkout@v4.1.7
+      - name: "Checking out runtime submodules"
+        run: ./build_tools/scripts/git/update_runtime_submodules.sh
+      - name: "Downloading install dir archive"
+        run: wget "${INSTALL_DIR_GCS_URL}" -O "${INSTALL_DIR_ARCHIVE}"
+      - name: "Extracting install directory"
+        run: tar -xf "${INSTALL_DIR_ARCHIVE}"
+      - name: "Building tests"
+        run: |
+          ./build_tools/pkgci/build_tests_using_package.sh ${INSTALL_DIR}
+      - name: "Running GPU tests"
+        env:
+          IREE_CTEST_LABEL_REGEX: ^requires-gpu|^driver=hip$
+          IREE_NVIDIA_SM80_TESTS_DISABLE: 1
+          IREE_MULTI_DEVICE_TESTS_DISABLE: 0
+          IREE_AMD_RDNA3_TESTS_DISABLE: 1
+          IREE_NVIDIA_GPU_TESTS_DISABLE: 0
+          IREE_CUDA_DISABLE: 1
+          IREE_CPU_DISABLE: 1
+          IREE_HIP_DISABLE: 0
+        run: |
+          ./build_tools/cmake/ctest_all.sh ${BUILD_DIR}
+
   test_amd_w7900:
     needs: [setup, build_all]
     if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_amd_w7900')
@@ -939,6 +983,7 @@
       - test_nvidia_gpu
       - test_nvidia_a100
       - test_amd_mi250
+      - test_amd_mi300
       - test_amd_w7900
 
       # Configurations
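
For local reproduction, the new test_amd_mi300 job boils down to building the test tree against a prebuilt install directory and running ctest with only the HIP driver enabled. A rough sketch of the equivalent commands (assuming a gfx942 machine, an install directory already extracted into ./install, and that the test build lands in build-tests as the job's BUILD_DIR indicates; the paths here are illustrative, not part of this change):

    # Build the tests against the downloaded install directory.
    ./build_tools/pkgci/build_tests_using_package.sh install

    # Mirror the job's environment: HIP on, every other driver off.
    export IREE_CPU_DISABLE=1 IREE_VULKAN_DISABLE=1 IREE_CUDA_DISABLE=1
    export IREE_HIP_DISABLE=0 IREE_HIP_TEST_TARGET_CHIP=gfx942
    export IREE_CTEST_LABEL_REGEX='^requires-gpu|^driver=hip$'

    # Run the filtered test suite.
    ./build_tools/cmake/ctest_all.sh build-tests
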
diff --git a/.github/workflows/pkgci_regression_test.yml b/.github/workflows/pkgci_regression_test.yml
index 99ee321..9050382 100644
--- a/.github/workflows/pkgci_regression_test.yml
+++ b/.github/workflows/pkgci_regression_test.yml
@@ -144,13 +144,20 @@
             runs-on: nodai-amdgpu-w7900-x86-64
 
           # AMD GPU
-          - name: amdgpu_rocm_gfx90a
+          - name: amdgpu_rocm_mi250_gfx90a
             models-config-file: models_gpu_rocm_gfx90a.json
             models-extra-flags-config-file: models_gpu_rocm_gfx90a_additional_flags.json
             sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx90a.json
             sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx90a.json
             sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx90a.json
             runs-on: nodai-amdgpu-mi250-x86-64
+          - name: amdgpu_rocm_mi300_gfx942
+            models-config-file: models_gpu_rocm_gfx942.json
+            models-extra-flags-config-file: models_gpu_rocm_gfx942_additional_flags.json
+            sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx942.json
+            sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx942.json
+            sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx942.json
+            runs-on: nodai-amdgpu-mi300-x86-64
           - name: amdgpu_vulkan
             models-config-file: models_gpu_vulkan.json
             runs-on: nodai-amdgpu-w7900-x86-64
@@ -174,7 +181,14 @@
       SDXL_CLIP_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-clip-config-file }}
       SDXL_VAE_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-vae-config-file }}
       VENV_DIR: ${{ github.workspace }}/venv
+      LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9
     steps:
+      # TODO(saienduri): Find an alternative to this temporary step; it changes permissions on the
+      # GitHub Actions work directory so that it can be cleaned up after every PR.
+      - name: Pre Checkout MI300 Step
+        if: contains(matrix.name, 'gfx942')
+        run: |
+          sudo chmod -R 777 ~/actions-runner/_work
       - name: Checking out IREE repository
         uses: actions/checkout@v4.1.7
         with:
@@ -293,8 +307,8 @@
             --durations=0 \
             --config-files=${SDXL_VAE_CONFIG_FILE_PATH}
 
-      - name: "Running SDXL rocm pipeline benchmark"
-        if: contains(matrix.name, 'rocm')
+      - name: "Running SDXL rocm pipeline benchmark (mi250)"
+        if: contains(matrix.name, 'rocm_mi250_gfx90a')
         run: |
           source ${VENV_DIR}/bin/activate
           pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \
@@ -313,3 +327,25 @@
             --log-cli-level=info \
             --retries 7
           echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
+          rm job_summary.md
+
+      - name: "Running SDXL rocm pipeline benchmark (mi300)"
+        if: contains(matrix.name, 'rocm_mi300_gfx942')
+        run: |
+          source ${VENV_DIR}/bin/activate
+          pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \
+            --goldentime-rocm-e2e-ms 320 \
+            --goldentime-rocm-unet-ms 77 \
+            --goldentime-rocm-clip-ms 15 \
+            --goldentime-rocm-vae-ms 74 \
+            --goldendispatch-rocm-unet 1714 \
+            --goldendispatch-rocm-clip 1569 \
+            --goldendispatch-rocm-vae 248 \
+            --goldensize-rocm-unet-bytes 2054938 \
+            --goldensize-rocm-clip-bytes 780328 \
+            --goldensize-rocm-vae-bytes 758509 \
+            --gpu-number 0 \
+            --rocm-chip gfx942 \
+            --log-cli-level=info \
+            --retries 7
+          echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
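
The new matrix entry plugs into the workflow's existing config-file plumbing: each JSON file added below is handed to the SHARK-TestSuite pytest runner via --config-files, the same way the gfx90a configs already are. A hedged sketch of an equivalent standalone invocation (assuming a checked-out SHARK-TestSuite next to the IREE repo and an activated venv with the IREE packages installed; the test directory is illustrative):

    source venv/bin/activate
    # Run the gfx942 model suite against the new config file.
    pytest SHARK-TestSuite/iree_tests/pytorch/models \
      --durations=0 \
      --config-files=build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942.json
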
diff --git a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942.json
new file mode 100644
index 0000000..5d451f0
--- /dev/null
+++ b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942.json
@@ -0,0 +1,28 @@
+{
+  "config_name": "gpu_rocm",
+  "iree_compile_flags": [
+    "--iree-hal-target-backends=rocm",
+    "--iree-rocm-target-chip=gfx942",
+    "--iree-input-demote-f64-to-f32"
+  ],
+  "iree_run_module_flags": [
+    "--device=hip"
+  ],
+  "skip_compile_tests": [
+    "pytorch/models/sdxl-scheduled-unet-3-tank",
+    "pytorch/models/sdxl-prompt-encoder-tank",
+    "pytorch/models/sdxl-vae-decode-tank"
+  ],
+  "skip_run_tests": [],
+  "expected_compile_failures": [
+    // TODO(#17344): need to regenerate .mlirbc
+    "pytorch/models/opt-125M",
+    "pytorch/models/resnet50",
+    "pytorch/models/sdxl-vae-decode-tank",
+
+    // error: 'builtin.module' op failed to run transform dialect passes
+    // (transform spec file is specific to SDXL?)
+    "sharktank/llama/open-llama-3b-v2-f16"
+  ],
+  "expected_run_failures": []
+}
diff --git a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json
new file mode 100644
index 0000000..28950d0
--- /dev/null
+++ b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json
@@ -0,0 +1,25 @@
+{
+  "config_name": "gpu_rocm",
+  "iree_compile_flags": [
+    "--iree-hal-target-backends=rocm",
+    "--iree-rocm-target-chip=gfx942",
+    "--iree-input-demote-f64-to-f32",
+    "--iree-opt-const-eval=false",
+    "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir"
+  ],
+  "iree_run_module_flags": [
+    "--device=hip"
+  ],
+  "skip_compile_tests": [
+    "pytorch/models/sdxl-scheduled-unet-3-tank",
+    "pytorch/models/sdxl-prompt-encoder-tank",
+    "pytorch/models/sdxl-vae-decode-tank"
+  ],
+  "skip_run_tests": [],
+  "expected_compile_failures": [
+    // TODO(#17344): need to regenerate .mlirbc
+    "pytorch/models/opt-125M",
+    "pytorch/models/resnet50"
+  ],
+  "expected_run_failures": []
+}
diff --git a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json
new file mode 100644
index 0000000..e3dbc9b
--- /dev/null
+++ b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json
@@ -0,0 +1,36 @@
+{
+  "config_name": "gpu_rocm",
+  "iree_compile_flags": [
+    "--iree-hal-target-backends=rocm",
+    "--iree-rocm-target-chip=gfx942",
+    "--iree-input-type=torch",
+    "--iree-opt-const-eval=false",
+    "--iree-global-opt-propagate-transposes=true",
+    "--iree-opt-outer-dim-concat=true",
+    "--iree-rocm-waves-per-eu=2",
+    "--iree-llvmgpu-enable-prefetch",
+    "--iree-flow-enable-aggressive-fusion",
+    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
+    "--iree-opt-aggressively-propagate-transposes=true",
+    "--iree-codegen-llvmgpu-use-vector-distribution=true",
+    "--iree-execution-model=async-external",
+    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))",
+    "--iree-scheduling-dump-statistics-format=json",
+    "--iree-scheduling-dump-statistics-file=compilation_info.json"
+  ],
+  "iree_run_module_flags": [
+    "--device=hip",
+    "--parameters=model=real_weights.irpa",
+    "--input=1x64xi64=@inference_input.0.bin",
+    "--input=1x64xi64=@inference_input.1.bin",
+    "--input=1x64xi64=@inference_input.2.bin",
+    "--input=1x64xi64=@inference_input.3.bin",
+    "--expected_output=2x64x2048xf16=@inference_output.0.bin",
+    "--expected_output=2x1280xf16=@inference_output.1.bin",
+    "--expected_f16_threshold=1.0f"
+  ],
+  "skip_compile_tests": [],
+  "skip_run_tests": [],
+  "expected_compile_failures": [],
+  "expected_run_failures": []
+}
diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json
new file mode 100644
index 0000000..289e99b
--- /dev/null
+++ b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json
@@ -0,0 +1,41 @@
+{
+  "config_name": "gpu_rocm",
+  "iree_compile_flags" : [
+    "--iree-hal-target-backends=rocm",
+    "--iree-rocm-target-chip=gfx942",
+    "--iree-opt-const-eval=false",
+    "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir",
+    "--iree-global-opt-propagate-transposes=true",
+    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
+    "--iree-flow-enable-aggressive-fusion=true",
+    "--iree-opt-aggressively-propagate-transposes=true",
+    "--iree-opt-outer-dim-concat=true",
+    "--iree-vm-target-truncate-unsupported-floats",
+    "--iree-llvmgpu-enable-prefetch=true",
+    "--iree-opt-data-tiling=false",
+    "--iree-codegen-gpu-native-math-precision=true",
+    "--iree-codegen-llvmgpu-use-vector-distribution",
+    "--iree-rocm-waves-per-eu=2",
+    "--iree-execution-model=async-external",
+    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
+    "--iree-scheduling-dump-statistics-format=json",
+    "--iree-scheduling-dump-statistics-file=compilation_info.json"
+  ],
+  "iree_run_module_flags": [
+    "--device=hip",
+    "--parameters=model=real_weights.irpa",
+    "--module=sdxl_scheduled_unet_pipeline_fp16_rocm.vmfb",
+    "--input=1x4x128x128xf16=@inference_input.0.bin",
+    "--input=2x64x2048xf16=@inference_input.1.bin",
+    "--input=2x1280xf16=@inference_input.2.bin",
+    "--input=1xf16=@inference_input.3.bin",
+    "--expected_output=1x4x128x128xf16=@inference_output.0.bin",
+    "--expected_f16_threshold=0.7f"
+  ],
+  "skip_compile_tests": [],
+  "skip_run_tests": [],
+  "expected_compile_failures": [],
+  "expected_run_failures": [
+    "pytorch/models/sdxl-scheduled-unet-3-tank",
+  ]
+}
diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json
new file mode 100644
index 0000000..1ea7251
--- /dev/null
+++ b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json
@@ -0,0 +1,29 @@
+{
+  "config_name": "gpu_rocm",
+  "iree_compile_flags" : [
+    "--iree-hal-target-backends=rocm",
+    "--iree-rocm-target-chip=gfx942",
+    "--iree-opt-const-eval=false",
+    "--iree-global-opt-propagate-transposes=true",
+    "--iree-opt-outer-dim-concat=true",
+    "--iree-llvmgpu-enable-prefetch=true",
+    "--iree-rocm-waves-per-eu=2",
+    "--iree-flow-enable-aggressive-fusion",
+    "--iree-codegen-llvmgpu-use-vector-distribution=true",
+    "--iree-execution-model=async-external",
+    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
+    "--iree-scheduling-dump-statistics-format=json",
+    "--iree-scheduling-dump-statistics-file=compilation_info.json"
+  ],
+  "iree_run_module_flags": [
+    "--device=hip",
+    "--parameters=model=real_weights.irpa",
+    "--input=1x4x128x128xf16=@inference_input.0.bin",
+    "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin",
+    "--expected_f16_threshold=0.4f"
+  ],
+  "skip_compile_tests": [],
+  "skip_run_tests": [],
+  "expected_compile_failures": [],
+  "expected_run_failures": []
+}
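
The iree_compile_flags and iree_run_module_flags in these configs map directly onto iree-compile and iree-run-module command lines, with the test harness supplying the source artifact and weights. As a rough illustration using the VAE decode config above (the .mlir input and output .vmfb names are made up for the example; real_weights.irpa and the .bin files come from the test suite's artifacts):

    # Compile for gfx942 with the config's iree_compile_flags.
    iree-compile sdxl_vae_decode.mlir \
      --iree-hal-target-backends=rocm \
      --iree-rocm-target-chip=gfx942 \
      --iree-opt-const-eval=false \
      -o sdxl_vae_decode_rocm_gfx942.vmfb

    # Run and check outputs with the config's iree_run_module_flags.
    iree-run-module \
      --module=sdxl_vae_decode_rocm_gfx942.vmfb \
      --device=hip \
      --parameters=model=real_weights.irpa \
      --input=1x4x128x128xf16=@inference_input.0.bin \
      --expected_output=1x3x1024x1024xf16=@inference_output.0.bin \
      --expected_f16_threshold=0.4f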