[Test] Add onnx_ops test suites with O2/O3 optimization level. (#21838)

Running these suites at O2/O3 helps catch failures earlier, because
users typically compile with O2/O3 for model performance. The main
difference is that aggressive fusion is enabled starting from O2.

---------

Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/.github/workflows/pkgci_test_onnx.yml b/.github/workflows/pkgci_test_onnx.yml
index 6df31d1..25493d6 100644
--- a/.github/workflows/pkgci_test_onnx.yml
+++ b/.github/workflows/pkgci_test_onnx.yml
@@ -26,33 +26,41 @@
       matrix:
         include:
           # CPU
-          - name: cpu_llvm_sync
-            config-file: onnx_ops_cpu_llvm_sync.json
+          - name: cpu_llvm_sync_O0
+            config-file: onnx_ops_cpu_llvm_sync_O0.json
+            numprocesses: auto
+            runs-on: ubuntu-24.04
+          - name: cpu_llvm_sync_O2
+            config-file: onnx_ops_cpu_llvm_sync_O2.json
             numprocesses: auto
             runs-on: ubuntu-24.04
 
           # AMD GPU
-          - name: amdgpu_hip_rdna3
+          - name: amdgpu_hip_rdna3_O0
             numprocesses: 1
-            config-file: onnx_ops_gpu_hip_rdna3.json
+            config-file: onnx_ops_gpu_hip_rdna3_O0.json
             runs-on: nodai-amdgpu-w7900-x86-64
-          - name: amdgpu_vulkan
+          - name: amdgpu_hip_rdna3_O3
+            numprocesses: 1
+            config-file: onnx_ops_gpu_hip_rdna3_O3.json
+            runs-on: nodai-amdgpu-w7900-x86-64
+          - name: amdgpu_vulkan_O0
             numprocesses: 4
-            config-file: onnx_ops_gpu_vulkan.json
+            config-file: onnx_ops_gpu_vulkan_O0.json
             runs-on: nodai-amdgpu-w7900-x86-64
 
           # NVIDIA GPU
           # TODO(#18238): migrate to new runner cluster
-          # - name: nvidiagpu_cuda
-          #   config-file: onnx_ops_gpu_cuda.json
+          # - name: nvidiagpu_cuda_O0
+          #   config-file: onnx_ops_gpu_cuda_O0.json
           #   numprocesses: 4
           #   runs-on:
           #     - self-hosted # must come first
           #     - environment=prod
           #     - gpu # TODO(scotttodd): qualify further with vendor/model
           #     - os-family=Linux
-          # - name: nvidiagpu_vulkan
-          #   config-file: onnx_ops_gpu_vulkan.json
+          # - name: nvidiagpu_vulkan_O0
+          #   config-file: onnx_ops_gpu_vulkan_O0.json
           #   numprocesses: 4
           #   runs-on:
           #     - self-hosted # must come first
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync_O0.json
similarity index 99%
rename from tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
rename to tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync_O0.json
index 58a32c3..e10fbc7 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync_O0.json
@@ -3,7 +3,8 @@
   "iree_compile_flags": [
     "--iree-hal-target-device=local",
     "--iree-hal-local-target-device-backends=llvm-cpu",
-    "--iree-input-demote-f64-to-f32=false"
+    "--iree-input-demote-f64-to-f32=false",
+    "--iree-opt-level=O0"
   ],
   "iree_run_module_flags": [
     "--device=local-sync"
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync_O2.json
similarity index 91%
copy from tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
copy to tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync_O2.json
index 58a32c3..d5d18d8 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_cpu_llvm_sync_O2.json
@@ -3,7 +3,8 @@
   "iree_compile_flags": [
     "--iree-hal-target-device=local",
     "--iree-hal-local-target-device-backends=llvm-cpu",
-    "--iree-input-demote-f64-to-f32=false"
+    "--iree-input-demote-f64-to-f32=false",
+    "--iree-opt-level=O2"
   ],
   "iree_run_module_flags": [
     "--device=local-sync"
@@ -81,6 +82,25 @@
     "onnx/node/generated/test_image_decoder_decode_pnm_rgb",
     "onnx/node/generated/test_image_decoder_decode_tiff_rgb",
     "onnx/node/generated/test_image_decoder_decode_webp_rgb",
+    "onnx/node/generated/test_layer_normalization_2d_axis1",
+    "onnx/node/generated/test_layer_normalization_2d_axis_negative_1",
+    "onnx/node/generated/test_layer_normalization_3d_axis1_epsilon",
+    "onnx/node/generated/test_layer_normalization_3d_axis2_epsilon",
+    "onnx/node/generated/test_layer_normalization_3d_axis_negative_1_epsilon",
+    "onnx/node/generated/test_layer_normalization_3d_axis_negative_2_epsilon",
+    "onnx/node/generated/test_layer_normalization_4d_axis1",
+    "onnx/node/generated/test_layer_normalization_4d_axis2",
+    "onnx/node/generated/test_layer_normalization_4d_axis3",
+    "onnx/node/generated/test_layer_normalization_4d_axis_negative_1",
+    "onnx/node/generated/test_layer_normalization_4d_axis_negative_2",
+    "onnx/node/generated/test_layer_normalization_4d_axis_negative_3",
+    "onnx/node/generated/test_layer_normalization_default_axis",
+    "onnx/node/generated/test_logsoftmax_axis_0",
+    "onnx/node/generated/test_logsoftmax_axis_0_expanded",
+    "onnx/node/generated/test_logsoftmax_axis_0_expanded_ver18",
+    "onnx/node/generated/test_logsoftmax_axis_1",
+    "onnx/node/generated/test_logsoftmax_axis_1_expanded",
+    "onnx/node/generated/test_logsoftmax_axis_1_expanded_ver18",
     "onnx/node/generated/test_loop11",
     "onnx/node/generated/test_lppool_2d_dilations",
     "onnx/node/generated/test_lppool_2d_same_lower",
@@ -106,6 +126,7 @@
     "onnx/node/generated/test_nllloss_NCd1d2_with_weight_reduction_sum_ii",
     "onnx/node/generated/test_nllloss_NCd1d2d3_none_no_weight_negative_ii",
     "onnx/node/generated/test_nllloss_NCd1d2d3d4d5_mean_weight",
+    "onnx/node/generated/test_nllloss_NCd1d2d3d4d5_mean_weight_expanded",
     "onnx/node/generated/test_nllloss_NCd1d2d3d4d5_none_no_weight",
     "onnx/node/generated/test_nonmaxsuppression_two_classes",
     "onnx/node/generated/test_nonzero_example",
@@ -292,6 +313,12 @@
     "onnx/node/generated/test_slice_neg_steps",
     "onnx/node/generated/test_slice_negative_axes",
     "onnx/node/generated/test_slice_start_out_of_bounds",
+    "onnx/node/generated/test_softmax_axis_0",
+    "onnx/node/generated/test_softmax_axis_0_expanded",
+    "onnx/node/generated/test_softmax_axis_0_expanded_ver18",
+    "onnx/node/generated/test_softmax_axis_1",
+    "onnx/node/generated/test_softmax_axis_1_expanded",
+    "onnx/node/generated/test_softmax_axis_1_expanded_ver18",
     "onnx/node/generated/test_stft",
     "onnx/node/generated/test_training_dropout",
     "onnx/node/generated/test_training_dropout_default",
@@ -327,8 +354,12 @@
     "onnx/node/generated/test_gridsample_nearest_align_corners_0_additional_1",
     "onnx/node/generated/test_gridsample_nearest_align_corners_1_additional_1",
     "onnx/node/generated/test_lstm_with_peepholes",
+    "onnx/node/generated/test_nonmaxsuppression_center_point_box_format",
     "onnx/node/generated/test_nonmaxsuppression_flipped_coordinates",
+    "onnx/node/generated/test_nonmaxsuppression_identical_boxes",
+    "onnx/node/generated/test_nonmaxsuppression_limit_output_size",
     "onnx/node/generated/test_nonmaxsuppression_single_box",
+    "onnx/node/generated/test_nonmaxsuppression_suppress_by_IOU",
     "onnx/node/generated/test_nonmaxsuppression_suppress_by_IOU_and_scores",
     "onnx/node/generated/test_pow",
     "onnx/node/generated/test_pow_example",
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_cuda.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_cuda_O0.json
similarity index 99%
rename from tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_cuda.json
rename to tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_cuda_O0.json
index dba0c5b..758590f 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_cuda.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_cuda_O0.json
@@ -2,7 +2,8 @@
   "config_name": "gpu_cuda_t4",
   "iree_compile_flags": [
     "--iree-hal-target-device=cuda",
-    "--iree-input-demote-f64-to-f32=false"
+    "--iree-input-demote-f64-to-f32=false",
+    "--iree-opt-level=O0"
   ],
   "iree_run_module_flags": [
     "--device=cuda"
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3_O0.json
similarity index 99%
rename from tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3.json
rename to tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3_O0.json
index 1ee7bfd..7780cc4 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3_O0.json
@@ -3,7 +3,8 @@
   "iree_compile_flags": [
     "--iree-hal-target-device=hip",
     "--iree-hip-target=gfx1100",
-    "--iree-input-demote-f64-to-f32=false"
+    "--iree-input-demote-f64-to-f32=false",
+    "--iree-opt-level=O0"
   ],
   "iree_run_module_flags": [
     "--device=hip"
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3_O3.json
similarity index 98%
copy from tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3.json
copy to tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3_O3.json
index 1ee7bfd..ec771f4 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_hip_rdna3_O3.json
@@ -3,7 +3,8 @@
   "iree_compile_flags": [
     "--iree-hal-target-device=hip",
     "--iree-hip-target=gfx1100",
-    "--iree-input-demote-f64-to-f32=false"
+    "--iree-input-demote-f64-to-f32=false",
+    "--iree-opt-level=O3"
   ],
   "iree_run_module_flags": [
     "--device=hip"
@@ -318,8 +319,12 @@
     "onnx/node/generated/test_gridsample_nearest_align_corners_0_additional_1",
     "onnx/node/generated/test_gridsample_nearest_align_corners_1_additional_1",
     "onnx/node/generated/test_lstm_with_peepholes",
+    "onnx/node/generated/test_nonmaxsuppression_center_point_box_format",
     "onnx/node/generated/test_nonmaxsuppression_flipped_coordinates",
+    "onnx/node/generated/test_nonmaxsuppression_identical_boxes",
+    "onnx/node/generated/test_nonmaxsuppression_limit_output_size",
     "onnx/node/generated/test_nonmaxsuppression_single_box",
+    "onnx/node/generated/test_nonmaxsuppression_suppress_by_IOU",
     "onnx/node/generated/test_nonmaxsuppression_suppress_by_IOU_and_scores",
     "onnx/node/generated/test_pow",
     "onnx/node/generated/test_qlinearmatmul_2D_int8_float16",
diff --git a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_vulkan.json b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_vulkan_O0.json
similarity index 99%
rename from tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_vulkan.json
rename to tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_vulkan_O0.json
index bb2377e..6979cda 100644
--- a/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_vulkan.json
+++ b/tests/external/iree-test-suites/onnx_ops/onnx_ops_gpu_vulkan_O0.json
@@ -2,7 +2,8 @@
   "config_name": "gpu_vulkan",
   "iree_compile_flags": [
     "--iree-hal-target-device=vulkan",
-    "--iree-input-demote-f64-to-f32"
+    "--iree-input-demote-f64-to-f32",
+    "--iree-opt-level=O0"
   ],
   "iree_run_module_flags": [
     "--device=vulkan"