[CPU] Enable 'iree-llvmcpu-reassociate-fp-reductions' by default (#13685) This PR enables fp reduction reassociation by default. When this flag is disabled, we are basically not vectorizing the reduction dimension at all, which results in extra unrolling of scalar instructions. It is difficult for an external user to fully understand the implications of this flag, or to know that it has to be enabled to get good performance on fp reductions.

commit: 55a27809defb75eb356bdfa200d452358dcb43e4 [log] [tgz]
author: Diego Caballero <diegocaballero@google.com> Fri May 19 10:08:09 2023 -0700
committer: GitHub <noreply@github.com> Fri May 19 10:08:09 2023 -0700
tree: 0945c1dcf652771b8c012ab718acd1d1c485ee86
parent: 90ed2d0277cc153f53e60ba83aae306c4dbb28bc [diff]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index fe8dd6f..e326b14 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

@@ -63,7 +63,7 @@
 static llvm::cl::opt<bool> clEnableReassociateFpReductions(
     "iree-llvmcpu-reassociate-fp-reductions",
     llvm::cl::desc("Enables reassociation for FP reductions"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));
 
 static llvm::cl::opt<bool> clInstrumentMemoryAccesses{
     "iree-llvmcpu-instrument-memory-accesses",

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index d26de94..4870f3a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir

@@ -57,10 +57,11 @@
 }
 // CHECK-LABEL: func.func @check_no_cse()
 //   CHECK-NOT:    memref.alloc
-//       CHECK:    %[[FOR:.+]] = scf.for
-//       CHECK:    %[[DIVF:.+]] = arith.divf %[[FOR]]
-//       CHECK:    %[[RES:.+]] = vector.extract %[[DIVF]]
-//       CHECK:    memref.store %[[RES]]
+//       CHECK:    scf.for
+//       CHECK:      arith.addf
+//       CHECK:    vector.reduction <add>
+//       CHECK:    arith.divf
+//       CHECK:    memref.store
 
 // -----
 
@@ -545,8 +546,8 @@
 hal.executable private @ukernel_pass_through {
   hal.executable.variant public @embedded_elf_x86_64, target = <
     "llvm-cpu", "embedded-elf-x86_64", {
-      cpu = "generic", cpu_features = "", 
-      data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", 
+      cpu = "generic", cpu_features = "",
+      data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
       native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf",
       ukernels = false}> {
     hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout<

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
index 37afde9..1d2bdda 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' --iree-llvmcpu-reassociate-fp-reductions=false --split-input-file %s | FileCheck %s
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target)))' --iree-llvmcpu-reassociate-fp-reductions=true --split-input-file %s | FileCheck %s --check-prefix=REORDERCHECK
 
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
index 3d60213..5e837f2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir

@@ -124,9 +124,8 @@
 }
 
 //   CHECK-LABEL: func.func @mask_dynamic_reduction
-// CHECK-COUNT-5:   vector.maskedload
-// CHECK-COUNT-4:   vector.mask %{{.*}} { vector.reduction <add>
-//         CHECK:   vector.maskedstore
+//         CHECK:   vector.maskedload
+//         CHECK:   vector.mask %{{.*}} { vector.reduction <add>
 
 // -----
 

diff --git a/tests/e2e/regression/BUILD.bazel b/tests/e2e/regression/BUILD.bazel
index 9730796..b356af8 100644
--- a/tests/e2e/regression/BUILD.bazel
+++ b/tests/e2e/regression/BUILD.bazel

@@ -58,9 +58,21 @@
 )
 
 iree_check_single_backend_test_suite(
-    name = "check_regression_llvm-cpu",
+    name = "check_fp_reassoc_regression_llvm-cpu",
     srcs = [
         "associative_reordering.mlir",
+    ],
+    compiler_flags = [
+        "--iree-input-type=mhlo",
+        "--iree-llvmcpu-reassociate-fp-reductions=false",
+    ],
+    driver = "local-task",
+    target_backend = "llvm-cpu",
+)
+
+iree_check_single_backend_test_suite(
+    name = "check_regression_llvm-cpu",
+    srcs = [
         "layernorm.mlir",
         "lowering_config.mlir",
         "pack_pad_transpose_1x9_into_2x4x8x4_issue_12546.mlir",

diff --git a/tests/e2e/regression/CMakeLists.txt b/tests/e2e/regression/CMakeLists.txt
index 2154feb..9913fa6 100644
--- a/tests/e2e/regression/CMakeLists.txt
+++ b/tests/e2e/regression/CMakeLists.txt

@@ -33,9 +33,22 @@
 
 iree_check_single_backend_test_suite(
   NAME
-    check_regression_llvm-cpu
+    check_fp_reassoc_regression_llvm-cpu
   SRCS
     "associative_reordering.mlir"
+  TARGET_BACKEND
+    "llvm-cpu"
+  DRIVER
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-input-type=mhlo"
+    "--iree-llvmcpu-reassociate-fp-reductions=false"
+)
+
+iree_check_single_backend_test_suite(
+  NAME
+    check_regression_llvm-cpu
+  SRCS
     "dynamic_abs.mlir"
     "dynamic_add.mlir"
     "dynamic_dot.mlir"
commit	55a27809defb75eb356bdfa200d452358dcb43e4	[log] [tgz]
author	Diego Caballero <diegocaballero@google.com>	Fri May 19 10:08:09 2023 -0700
committer	GitHub <noreply@github.com>	Fri May 19 10:08:09 2023 -0700
tree	0945c1dcf652771b8c012ab718acd1d1c485ee86
parent	90ed2d0277cc153f53e60ba83aae306c4dbb28bc [diff]