[CPU] Enable 'iree-llvmcpu-reassociate-fp-reductions' by default (#13685)

This PR enables fp reduction reassociation by default. When this flag is disabled, we are basically not vectorizing the reduction dimension at all, which results in extra unrolling of scalar instructions. It's difficult that an external user really understands the implications of this flag and that it has to be enabled to get some performance on fp reductions.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index fe8dd6f..e326b14 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -63,7 +63,7 @@
 static llvm::cl::opt<bool> clEnableReassociateFpReductions(
     "iree-llvmcpu-reassociate-fp-reductions",
     llvm::cl::desc("Enables reassociation for FP reductions"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));
 
 static llvm::cl::opt<bool> clInstrumentMemoryAccesses{
     "iree-llvmcpu-instrument-memory-accesses",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index d26de94..4870f3a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -57,10 +57,11 @@
 }
 // CHECK-LABEL: func.func @check_no_cse()
 //   CHECK-NOT:    memref.alloc
-//       CHECK:    %[[FOR:.+]] = scf.for
-//       CHECK:    %[[DIVF:.+]] = arith.divf %[[FOR]]
-//       CHECK:    %[[RES:.+]] = vector.extract %[[DIVF]]
-//       CHECK:    memref.store %[[RES]]
+//       CHECK:    scf.for
+//       CHECK:      arith.addf
+//       CHECK:    vector.reduction <add>
+//       CHECK:    arith.divf
+//       CHECK:    memref.store
 
 // -----
 
@@ -545,8 +546,8 @@
 hal.executable private @ukernel_pass_through {
   hal.executable.variant public @embedded_elf_x86_64, target = <
     "llvm-cpu", "embedded-elf-x86_64", {
-      cpu = "generic", cpu_features = "", 
-      data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", 
+      cpu = "generic", cpu_features = "",
+      data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
       native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf",
       ukernels = false}> {
     hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout<
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
index 37afde9..1d2bdda 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' --iree-llvmcpu-reassociate-fp-reductions=false --split-input-file %s | FileCheck %s
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' --iree-llvmcpu-reassociate-fp-reductions=true --split-input-file %s | FileCheck %s --check-prefix=REORDERCHECK
 
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
index 3d60213..5e837f2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
@@ -124,9 +124,8 @@
 }
 
 //   CHECK-LABEL: func.func @mask_dynamic_reduction
-// CHECK-COUNT-5:   vector.maskedload
-// CHECK-COUNT-4:   vector.mask %{{.*}} { vector.reduction <add>
-//         CHECK:   vector.maskedstore
+//         CHECK:   vector.maskedload
+//         CHECK:   vector.mask %{{.*}} { vector.reduction <add>
 
 // -----
 
diff --git a/tests/e2e/regression/BUILD.bazel b/tests/e2e/regression/BUILD.bazel
index 9730796..b356af8 100644
--- a/tests/e2e/regression/BUILD.bazel
+++ b/tests/e2e/regression/BUILD.bazel
@@ -58,9 +58,21 @@
 )
 
 iree_check_single_backend_test_suite(
-    name = "check_regression_llvm-cpu",
+    name = "check_fp_reassoc_regression_llvm-cpu",
     srcs = [
         "associative_reordering.mlir",
+    ],
+    compiler_flags = [
+        "--iree-input-type=mhlo",
+        "--iree-llvmcpu-reassociate-fp-reductions=false",
+    ],
+    driver = "local-task",
+    target_backend = "llvm-cpu",
+)
+
+iree_check_single_backend_test_suite(
+    name = "check_regression_llvm-cpu",
+    srcs = [
         "layernorm.mlir",
         "lowering_config.mlir",
         "pack_pad_transpose_1x9_into_2x4x8x4_issue_12546.mlir",
diff --git a/tests/e2e/regression/CMakeLists.txt b/tests/e2e/regression/CMakeLists.txt
index 2154feb..9913fa6 100644
--- a/tests/e2e/regression/CMakeLists.txt
+++ b/tests/e2e/regression/CMakeLists.txt
@@ -33,9 +33,22 @@
 
 iree_check_single_backend_test_suite(
   NAME
-    check_regression_llvm-cpu
+    check_fp_reassoc_regression_llvm-cpu
   SRCS
     "associative_reordering.mlir"
+  TARGET_BACKEND
+    "llvm-cpu"
+  DRIVER
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-llvmcpu-reassociate-fp-reductions=false"
+)
+
+iree_check_single_backend_test_suite(
+  NAME
+    check_regression_llvm-cpu
+  SRCS
     "dynamic_abs.mlir"
     "dynamic_add.mlir"
     "dynamic_dot.mlir"