Add a test that compiles softmax under aggressive fusion. (#11362)

All the changes needed to enable aggressive fusion on a softmax input have landed. The input now compiles, but the result contains a stack allocation that exceeds the allowed limit. For now, this test is added with the stack allocation bound check relaxed (`--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false`); the oversized allocation itself still needs to be addressed.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp
index 71f6369..c26b2a9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUCheckIRBeforeLLVMConversion.cpp
@@ -17,8 +17,8 @@
     "iree-llvmcpu-stack-allocation-limit",
     llvm::cl::desc("maximum allowed stack allocation size in bytes"),
     llvm::cl::init(32768));
-static llvm::cl::opt<bool> clFailUnboundDynamicStackAllocation(
-    "iree-llvmcpu-fail-unbound-dynamic-stack-allocation",
+static llvm::cl::opt<bool> clFailOnOutOfBoundsStackAllocation(
+    "iree-llvmcpu-fail-on-out-of-bounds-stack-allocation",
     llvm::cl::desc("fail if the upper bound of dynamic stack allocation cannot "
                    "be solved"),
     llvm::cl::init(true));
@@ -45,7 +45,7 @@
       auto ub = linalg::getConstantUpperBoundForIndex(operand);
       if (succeeded(ub)) {
         size *= *ub;
-      } else if (clFailUnboundDynamicStackAllocation) {
+      } else if (clFailOnOutOfBoundsStackAllocation) {
         return allocaOp.emitOpError(
             "expected no stack allocations without upper bound shapes");
       }
@@ -62,7 +62,8 @@
     return signalPassFailure();
   }
   int maxAllocationSizeInBits = clMaxAllocationSizeInBytes * 8;
-  if (totalBits > maxAllocationSizeInBits) {
+  if (clFailOnOutOfBoundsStackAllocation &&
+      totalBits > maxAllocationSizeInBits) {
     moduleOp.emitOpError(
         "expected total size of stack allocation is not greater than ")
         << clMaxAllocationSizeInBytes.getValue() << " bytes, but got "
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir
index 78d88e6..adae2c3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/check_ir_before_llvm_conversion_not_fail_unbound.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-llvmcpu-check-ir-before-llvm-conversion --iree-llvmcpu-fail-unbound-dynamic-stack-allocation=false %s --verify-diagnostics -split-input-file
+// RUN: iree-opt --iree-llvmcpu-check-ir-before-llvm-conversion --iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false %s --verify-diagnostics -split-input-file
 
 module {
   func.func @dynamic_allocas(%arg0: index) {
@@ -6,11 +6,11 @@
     return
   }
 }
+// CHECK-LABEL: func @dynamic_allocas(
 
 // -----
 
 #map = affine_map<(d0) -> (-d0, 16384)>
-// expected-error @+1 {{expected total size of stack allocation is not greater than 32768 bytes, but got 65536 bytes}}
 module {
   func.func @dynamic_big_allocas(%arg0: index, %arg1: index) {
     %0 = affine.min #map(%arg0)
@@ -18,13 +18,14 @@
     return
   }
 }
+// CHECK-LABEL: func @dynamic_big_allocas(
 
 // -----
 
-// expected-error @+1 {{expected total size of stack allocation is not greater than 32768 bytes, but got 65536 bytes}}
 module {
   func.func @mix_static_and_unbound_dynamic_allocas(%arg0: index) {
     %0 = memref.alloca(%arg0) : memref<?x16384xf32>
     return
   }
 }
+// CHECK-LABEL: func @mix_static_and_unbound_dynamic_allocas(
\ No newline at end of file
diff --git a/tests/e2e/regression/BUILD b/tests/e2e/regression/BUILD
index 159c45f..6c1ecf6 100644
--- a/tests/e2e/regression/BUILD
+++ b/tests/e2e/regression/BUILD
@@ -144,3 +144,16 @@
     driver = "local-task",
     target_backend = "llvm-cpu",
 )
+
+iree_check_single_backend_test_suite(
+    name = "aggressive_fusion_test",
+    srcs = [
+        "softmax.mlir",
+    ],
+    compiler_flags = [
+        "--iree-flow-enable-aggressive-fusion",
+        "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
+    ],
+    driver = "local-task",
+    target_backend = "llvm-cpu",
+)
diff --git a/tests/e2e/regression/CMakeLists.txt b/tests/e2e/regression/CMakeLists.txt
index 0c06be4..df1a0d4 100644
--- a/tests/e2e/regression/CMakeLists.txt
+++ b/tests/e2e/regression/CMakeLists.txt
@@ -173,4 +173,18 @@
     "-iree-flow-demote-f64-to-f32=false"
 )
 
+iree_check_single_backend_test_suite(
+  NAME
+    aggressive_fusion_test
+  SRCS
+    "softmax.mlir"
+  TARGET_BACKEND
+    "llvm-cpu"
+  DRIVER
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-flow-enable-aggressive-fusion"
+    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false"
+)
+
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###