Enable LTO optimization by default for runtime releases. (#16811)

This is done by generalizing the primordial `IREE_SIZE_OPTIMIZED` flag
into a `IREE_RUNTIME_OPTIMIZATION_PROFILE` that:

* Can enable 'lto' or 'size'.
* Is scoped to just the runtime targets.
* Minimally does the right thing for 'size' on Linux as well, rather than
only on Windows (not the goal of this patch, but it drops ~300KB from
binary sizes when enabled).

The compile time delta for a clean build of the runtime in full LTO vs
regular mode was not measured precisely but is in the noise (i.e. <1m).
As such, just enabling by default for Python release binaries.

Others can be enabled via: `-DIREE_RUNTIME_OPTIMIZATION_PROFILE=lto`,
which is recommended for benchmarking, etc.

Note that this removes the use of the CMake option
`IREE_SIZE_OPTIMIZED`. It was never even declared properly as an option
and didn't do the same class of thing across Windows/Linux. This has
been fixed and it can be enabled via
`-DIREE_RUNTIME_OPTIMIZATION_PROFILE=size`. Note that, as was already the
case on Windows, the 'size' profile implies LTO. If the old behavior
without LTO is desired, we can add a profile for that.

Progress on #898.

---------

Co-authored-by: Scott Todd <scott.todd0@gmail.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7603dd5..6bd7261 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -123,6 +123,11 @@
 # CI coverage is established.
 option(BUILD_SHARED_LIBS "Instructs CMake to build libraries as shared if possible" OFF)
 
+# Optimization profile and LTO settings for the runtime build.
+set(IREE_RUNTIME_OPTIMIZATION_PROFILE "" CACHE STRING 
+    "Build optimization profile to apply. One of '', 'lto', 'size'.")
+set(IREE_LTO_MODE "full" CACHE STRING "LTO type, 'thin' or 'full'. Only consulted on clang-like compilers.")
+
 #-------------------------------------------------------------------------------
 # IREE command-line tooling configuration
 #-------------------------------------------------------------------------------
diff --git a/build_tools/cmake/build_runtime.sh b/build_tools/cmake/build_runtime.sh
index 7db3542..4fb41ff 100755
--- a/build_tools/cmake/build_runtime.sh
+++ b/build_tools/cmake/build_runtime.sh
@@ -28,6 +28,8 @@
   "-DPython3_EXECUTABLE=${IREE_PYTHON3_EXECUTABLE}"
   "-DPYTHON_EXECUTABLE=${IREE_PYTHON3_EXECUTABLE}"
   "-DCMAKE_BUILD_TYPE=RelWithDebInfo"
+  "-DIREE_RUNTIME_OPTIMIZATION_PROFILE=lto"
+  "-DIREE_FORCE_LTO_COMPAT_BINUTILS_ON_LINUX=ON"
   "-DIREE_BUILD_COMPILER=OFF"
 )
 
diff --git a/build_tools/cmake/build_runtime_small.sh b/build_tools/cmake/build_runtime_small.sh
index 88cb2c9..eb456e2 100755
--- a/build_tools/cmake/build_runtime_small.sh
+++ b/build_tools/cmake/build_runtime_small.sh
@@ -22,5 +22,6 @@
   -DPYTHON_EXECUTABLE="${IREE_PYTHON3_EXECUTABLE}" \
   -DCMAKE_BUILD_TYPE=MinSizeRel \
-  -DIREE_SIZE_OPTIMIZED=ON \
+  -DIREE_RUNTIME_OPTIMIZATION_PROFILE=size \
+  -DIREE_FORCE_LTO_COMPAT_BINUTILS_ON_LINUX=ON \
   -DIREE_BUILD_COMPILER=OFF
 "${CMAKE_BIN?}" --build "${BUILD_DIR}" -- -k 0
diff --git a/build_tools/cmake/external_cc_library.cmake b/build_tools/cmake/external_cc_library.cmake
index 02ae7ff..a49ca65 100644
--- a/build_tools/cmake/external_cc_library.cmake
+++ b/build_tools/cmake/external_cc_library.cmake
@@ -134,6 +134,8 @@
       PRIVATE
         ${_RULE_COPTS}
         ${IREE_DEFAULT_COPTS}
+      INTERFACE
+        ${IREE_INTERFACE_COPTS}
     )
     target_link_options(${_NAME}
       PRIVATE
diff --git a/build_tools/cmake/iree_cc_binary.cmake b/build_tools/cmake/iree_cc_binary.cmake
index 2e8af2f..c1b6025 100644
--- a/build_tools/cmake/iree_cc_binary.cmake
+++ b/build_tools/cmake/iree_cc_binary.cmake
@@ -124,6 +124,7 @@
   target_compile_options(${_NAME}
     PRIVATE
       ${IREE_DEFAULT_COPTS}
+      ${IREE_INTERFACE_COPTS}
       ${_RULE_COPTS}
   )
   target_link_options(${_NAME}
diff --git a/build_tools/cmake/iree_cc_library.cmake b/build_tools/cmake/iree_cc_library.cmake
index a9cc1e9..980f8ff 100644
--- a/build_tools/cmake/iree_cc_library.cmake
+++ b/build_tools/cmake/iree_cc_library.cmake
@@ -230,6 +230,8 @@
       PRIVATE
         ${IREE_DEFAULT_COPTS}
         ${_RULE_COPTS}
+      INTERFACE
+        ${IREE_INTERFACE_COPTS}
     )
     target_link_options(${_NAME}
       PRIVATE
diff --git a/build_tools/cmake/iree_copts.cmake b/build_tools/cmake/iree_copts.cmake
index 10342b4..0f792e1 100644
--- a/build_tools/cmake/iree_copts.cmake
+++ b/build_tools/cmake/iree_copts.cmake
@@ -415,48 +415,109 @@
 endif()
 
 #-------------------------------------------------------------------------------
-# Size-optimized build flags
+# Flag sets used by the different optimization profiles.
 #-------------------------------------------------------------------------------
 
-# TODO(#898): add a dedicated size-constrained configuration.
-if(IREE_SIZE_OPTIMIZED)
-  iree_select_compiler_opts(IREE_SIZE_OPTIMIZED_DEFAULT_COPTS
-    MSVC_OR_CLANG_CL
-      "/GS-"
-      "/GL"
-      "/Gw"
-      "/Gy"
-      "/DNDEBUG"
-      "/Os"
-      "/Oy"
-      "/Zi"
-      "/c"
+iree_select_compiler_opts(IREE_LTO_COPTS
+  CLANG
+    "-flto=${IREE_LTO_MODE}"
+  GCC
+    "-flto"
+    "-fuse-linker-plugin"
+  MSVC_OR_CLANG_CL
+    "/GL"
+)
+
+iree_select_compiler_opts(IREE_LTO_LINKOPTS
+  CLANG
+    "-flto=${IREE_LTO_MODE}"
+  GCC
+    "-flto"
+  MSVC_OR_CLANG_CL
+    "-LTCG"
+)
+
+iree_select_compiler_opts(IREE_SIZE_OPTIMIZED_DEFAULT_COPTS
+  MSVC_OR_CLANG_CL
+    "/GS-"
+    "/Gw"
+    "/Gy"
+    "/DNDEBUG"
+    "/Os"
+    "/Oy"
+    "/Zi"
+    "/c"
+)
+iree_select_compiler_opts(IREE_SIZE_OPTIMIZED_DEFAULT_LINKOPTS
+  MSVC_OR_CLANG_CL
+    "-DEBUG:FULL"
+    "-opt:ref,icf"
+)
+
+# Enables an optimization profile for a sub-tree by appending to the
+# IREE_DEFAULT_COPTS, IREE_INTERFACE_COPTS and IREE_DEFAULT_LINKOPTS variables
+# in the caller's scope, which targets created after this point use.
+#
+# Available profiles:
+#   "lto": Applies options to enable link time code generation.
+#   "size": Applies a variety of options to minimize the size of the runtime,
+#     generally at the expense of features but not performance. This implies
+#     LTO.
+#
+# Parameters:
+# PROFILE_NAME: "lto", "size", or falsey for none; anything else is fatal.
+# SIZE_INTERFACE_COPTS: Additional IREE_INTERFACE_COPTS to add for the
+#   "size" profile.
+function(iree_enable_optimization_options)
+  cmake_parse_arguments(
+    _RULE
+    ""
+    "PROFILE_NAME"
+    "SIZE_INTERFACE_COPTS"
+    ${ARGN}
   )
-  iree_select_compiler_opts(IREE_SIZE_OPTIMIZED_DEFAULT_LINKOPTS
-    MSVC_OR_CLANG_CL
-      "-DEBUG:FULL"
-      "-LTCG"
-      "-opt:ref,icf"
+
+  if(NOT _RULE_PROFILE_NAME)
+    # Empty/falsey profile: leave all flag variables untouched.
+    return()
+  endif()
+
+  set(_ADDL_COPTS)
+  set(_ADDL_INTERFACE_COPTS)
+  set(_ADDL_LINKOPTS)
+
+  if(_RULE_PROFILE_NAME STREQUAL "lto")
+    set(_ADDL_COPTS ${IREE_LTO_COPTS})
+    set(_ADDL_LINKOPTS ${IREE_LTO_LINKOPTS})
+  elseif(_RULE_PROFILE_NAME STREQUAL "size")
+    # Size optimized assumes LTO.
+    # Size optimized often also elides logging and various status reporting,
+    # which can result in unused-but-set-variable style warnings. Disable those.
+    iree_select_compiler_opts(_ADDL_COPTS
+      ALL
+        ${IREE_LTO_COPTS}
+        ${IREE_SIZE_OPTIMIZED_DEFAULT_COPTS}
+      CLANG_OR_GCC
+        -Wno-unused-but-set-variable
+    )
+    set(_ADDL_INTERFACE_COPTS "${_RULE_SIZE_INTERFACE_COPTS}")
+    set(_ADDL_LINKOPTS
+      ${IREE_LTO_LINKOPTS}
+      ${IREE_SIZE_OPTIMIZED_DEFAULT_LINKOPTS}
+    )
+  else()
+    message(FATAL_ERROR "Unrecognized optimization profile name '${_RULE_PROFILE_NAME}'. Expected one of 'lto', 'size'")
+  endif()
+
+  message(STATUS "Enabled optimization profile '${_RULE_PROFILE_NAME}' for targets under ${CMAKE_CURRENT_SOURCE_DIR}: \n"
+    "      COPTS: ${_ADDL_COPTS}\n"
+    "      INTERFACE COPTS: ${_ADDL_INTERFACE_COPTS}\n"
+    "      LINKOPTS: ${_ADDL_LINKOPTS}"
   )
-  # TODO(#898): make this only impact the runtime (IREE_RUNTIME_DEFAULT_...).
-  # These flags come from iree/base/config.h:
-  set(IREE_DEFAULT_COPTS
-      "${IREE_DEFAULT_COPTS}"
-      "${IREE_SIZE_OPTIMIZED_DEFAULT_COPTS}"
-      "-DIREE_STATUS_MODE=0"
-      "-DIREE_STATISTICS_ENABLE=0"
-      "-DIREE_HAL_MODULE_STRING_UTIL_ENABLE=0"
-      "-DIREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE=0"
-      "-DIREE_VM_BACKTRACE_ENABLE=0"
-      "-DIREE_VM_BYTECODE_VERIFICATION_ENABLE=0"
-      "-DIREE_VM_EXT_F32_ENABLE=0"
-      "-DIREE_VM_EXT_F64_ENABLE=0"
-  )
-  set(IREE_DEFAULT_LINKOPTS
-      "${IREE_DEFAULT_LINKOPTS}"
-      "${IREE_SIZE_OPTIMIZED_DEFAULT_LINKOPTS}"
-  )
-endif()
+  set(IREE_DEFAULT_COPTS "${IREE_DEFAULT_COPTS};${_ADDL_COPTS}" PARENT_SCOPE)
+  set(IREE_INTERFACE_COPTS "${IREE_INTERFACE_COPTS};${_ADDL_INTERFACE_COPTS}" PARENT_SCOPE)
+  set(IREE_DEFAULT_LINKOPTS "${IREE_DEFAULT_LINKOPTS};${_ADDL_LINKOPTS}" PARENT_SCOPE)
+endfunction()
 
 #-------------------------------------------------------------------------------
 # Compiler: Clang/LLVM
diff --git a/build_tools/cmake/iree_setup_toolchain.cmake b/build_tools/cmake/iree_setup_toolchain.cmake
index d046ddd..a38aef0 100644
--- a/build_tools/cmake/iree_setup_toolchain.cmake
+++ b/build_tools/cmake/iree_setup_toolchain.cmake
@@ -22,6 +22,49 @@
 # explicitly or through global properties. Please don't add to it without
 # a very good reason.
 macro(iree_setup_toolchain)
+  #-------------------------------------------------------------------------------
+  # Force LTO compatible tools.
+  #-------------------------------------------------------------------------------
+
+  # On older (i.e. gcc 9.x era) systems, the compiler and system toolchains
+  # were not compatible for general LTO use, and they were further not
+  # compatible amongst themselves.
+  # As an aid to CIs, we provide an option which will force toolchain specific
+  # binutils and linkers only if running on Linux. This lets us use the same 
+  # runtime build scripts across platforms without further shenanigans. 
+  # This is a hack and should be rolled back once 2020 era systems are not in 
+  # use.
+  # Users should not use this. If they have such an old system, configure CMake
+  # to use toolchain specific tools.
+  option(IREE_FORCE_LTO_COMPAT_BINUTILS_ON_LINUX "Forces use of toolchain specific LTO compatible binutils if on Linux" OFF)
+  mark_as_advanced(IREE_FORCE_LTO_COMPAT_BINUTILS_ON_LINUX)
+  if(IREE_FORCE_LTO_COMPAT_BINUTILS_ON_LINUX AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      message(STATUS "Running on an old Linux with -DIREE_FORCE_LTO_COMPAT_BINUTILS_ON_LINUX: Forcing llvm-ar, llvm-nm, llvm-ranlib, and ld.lld")
+      find_program(IREE_CMAKE_LTO_AR llvm-ar REQUIRED)
+      find_program(IREE_CMAKE_LTO_RANLIB llvm-ranlib REQUIRED)
+      find_program(IREE_CMAKE_LTO_NM llvm-nm REQUIRED)
+      set(IREE_USE_LINKER "lld")  
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      message(STATUS "Running on an old Linux with -DIREE_FORCE_LTO_COMPAT_BINUTILS_ON_LINUX: Forcing gcc-ar, gcc-nm, gcc-ranlib, and ld.gold")
+      find_program(IREE_CMAKE_LTO_AR gcc-ar REQUIRED)
+      find_program(IREE_CMAKE_LTO_RANLIB gcc-ranlib REQUIRED)
+      find_program(IREE_CMAKE_LTO_NM gcc-nm REQUIRED)
+      set(IREE_USE_LINKER "gold")
+    endif()
+
+    set(IREE_ENABLE_LLD OFF)
+    find_program(IREE_CMAKE_LTO_LD ld.${IREE_USE_LINKER} REQUIRED)
+    mark_as_advanced(IREE_CMAKE_LTO_AR IREE_CMAKE_LTO_RANLIB IREE_CMAKE_LTO_NM IREE_CMAKE_LTO_LD)
+
+    set(CMAKE_AR ${IREE_CMAKE_LTO_AR} CACHE FILEPATH "Forcing LTO ar instead of ar" FORCE)
+    set(CMAKE_AR ${IREE_CMAKE_LTO_AR})
+    set(CMAKE_NM ${IREE_CMAKE_LTO_NM} CACHE FILEPATH "Forcing LTO nm instead of nm" FORCE)
+    set(CMAKE_NM ${IREE_CMAKE_LTO_NM})
+    set(CMAKE_RANLIB ${IREE_CMAKE_LTO_RANLIB} CACHE FILEPATH "Forcing LTO ranlib instead of ranlib" FORCE)
+    set(CMAKE_RANLIB ${IREE_CMAKE_LTO_RANLIB})
+  endif()
+
   #-----------------------------------------------------------------------------
   # Supports dynamic library loading.
   #-----------------------------------------------------------------------------
diff --git a/experimental/regression_suite/external_test_suite/config_cpu_llvm_sync.json b/experimental/regression_suite/external_test_suite/config_cpu_llvm_sync.json
index 946c152..d578270 100644
--- a/experimental/regression_suite/external_test_suite/config_cpu_llvm_sync.json
+++ b/experimental/regression_suite/external_test_suite/config_cpu_llvm_sync.json
@@ -794,9 +794,11 @@
     "test_clip_default_int8_min",
     "test_clip_default_int8_min_expanded",
     "test_constant_pad",
+    "test_constantofshape_float_ones",
     "test_constantofshape_int_shape_zero",
     "test_constantofshape_int_zeros",
     "test_div_uint8",
+    "test_dropout_default_mask_ratio",
     "test_elu_default",
     "test_gather_0",
     "test_gather_1",
@@ -834,13 +836,14 @@
     "test_pow_types_float32_uint64",
     "test_qlinearconv",
     "test_qlinearmatmul_2D_int8_float16",
+    "test_qlinearmatmul_2D_int8_float32",
     "test_qlinearmatmul_3D_int8_float16",
     "test_qlinearmatmul_3D_int8_float32",
     "test_qlinearmatmul_3D_uint8_float16",
-    "test_qlinearmatmul_2D_int8_float32",
     "test_qlinearmatmul_3D_uint8_float32",
     "test_quantizelinear",
     "test_range_int32_type_negative_delta",
+    "test_reduce_min_empty_set",
     "test_scatter_elements_with_negative_indices",
     "test_selu_default",
     "test_shape",
diff --git a/experimental/regression_suite/external_test_suite/config_gpu_vulkan.json b/experimental/regression_suite/external_test_suite/config_gpu_vulkan.json
index e0ceda7..2b21bef 100644
--- a/experimental/regression_suite/external_test_suite/config_gpu_vulkan.json
+++ b/experimental/regression_suite/external_test_suite/config_gpu_vulkan.json
@@ -795,15 +795,17 @@
     "test_castlike_FLOAT_to_BFLOAT16_expanded",
     "test_castlike_FLOAT_to_DOUBLE",
     "test_castlike_FLOAT_to_DOUBLE_expanded",
-    "test_clip_default_int8_min",
-    "test_clip_default_int8_min_expanded",
     "test_clip_default_int8_inbounds",
     "test_clip_default_int8_max",
     "test_clip_default_int8_max_expanded",
+    "test_clip_default_int8_min",
+    "test_clip_default_int8_min_expanded",
     "test_constant_pad",
+    "test_constantofshape_float_ones",
     "test_constantofshape_int_shape_zero",
     "test_constantofshape_int_zeros",
     "test_div_uint8",
+    "test_dropout_default_mask_ratio",
     "test_elu_default",
     "test_gather_0",
     "test_gather_1",
@@ -840,12 +842,12 @@
     "test_pow_types_float32_uint32",
     "test_pow_types_float32_uint64",
     "test_qlinearconv",
+    "test_qlinearmatmul_2D_int8_float16",
+    "test_qlinearmatmul_2D_int8_float32",
     "test_qlinearmatmul_3D_int8_float16",
     "test_qlinearmatmul_3D_int8_float32",
     "test_qlinearmatmul_3D_uint8_float16",
     "test_qlinearmatmul_3D_uint8_float32",
-    "test_qlinearmatmul_2D_int8_float16",
-    "test_qlinearmatmul_2D_int8_float32",
     "test_quantizelinear",
     "test_range_int32_type_negative_delta",
     "test_scatter_elements_with_negative_indices",
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 8ee250b..4d48d72 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -4,6 +4,22 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+iree_enable_optimization_options(
+  PROFILE_NAME
+    "${IREE_RUNTIME_OPTIMIZATION_PROFILE}"
+  # TODO: These options should be separated between those required as
+  # INTERFACE and those that can be private (i.e. to the runtime).
+  SIZE_INTERFACE_COPTS
+    "-DIREE_STATUS_MODE=0"
+    "-DIREE_STATISTICS_ENABLE=0"
+    "-DIREE_HAL_MODULE_STRING_UTIL_ENABLE=0"
+    "-DIREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE=0"
+    "-DIREE_VM_BACKTRACE_ENABLE=0"
+    "-DIREE_VM_BYTECODE_VERIFICATION_ENABLE=0"
+    "-DIREE_VM_EXT_F32_ENABLE=0"
+    "-DIREE_VM_EXT_F64_ENABLE=0"  
+)
+
 # Must include runtime plugins before processing the runtime sources so that
 # the static link list can be set.
 iree_include_cmake_plugin_dirs(
diff --git a/runtime/setup.py b/runtime/setup.py
index cf77f43..cbaee8e 100644
--- a/runtime/setup.py
+++ b/runtime/setup.py
@@ -72,6 +72,10 @@
         "*** Tracy tools not enabled (enable with IREE_RUNTIME_BUILD_TRACY_TOOLS=ON)",
         file=sys.stderr,
     )
+# Default to LTO builds for our python releases.
+IREE_RUNTIME_OPTIMIZATION_PROFILE = os.getenv(
+    "IREE_RUNTIME_OPTIMIZATION_PROFILE", "lto"
+)
 
 
 def check_pip_version():
@@ -264,6 +268,7 @@
     cmake_args = [
         "-GNinja",
         "--log-level=VERBOSE",
+        f"-DIREE_RUNTIME_OPTIMIZATION_PROFILE={IREE_RUNTIME_OPTIMIZATION_PROFILE}",
         "-DIREE_BUILD_PYTHON_BINDINGS=ON",
         "-DIREE_BUILD_COMPILER=OFF",
         "-DIREE_BUILD_SAMPLES=OFF",