Merge pull request #7934 from google/benvanik-buffer-hooks

Adding iree_hal_allocator_t::deallocate_buffer.
diff --git a/SUBMODULE_VERSIONS.txt b/SUBMODULE_VERSIONS.txt
index 1daeded..696af44 100644
--- a/SUBMODULE_VERSIONS.txt
+++ b/SUBMODULE_VERSIONS.txt
@@ -4,14 +4,14 @@
 aa533abfd4232b01f9e57041d70114d5a77e6de0 third_party/googletest
 88b845dee001723c4a0db1fe5477de735b6d3bb0 third_party/liburing
 f8f760f7387d2cc56a2fc7b1be313a3bf3f7f58c third_party/libyaml
-4f60a42878b0d46bc2cc84d8f0d316cac2c60c9d third_party/llvm-project
-cf097ee16b718cce7498747416772e1b3a7e9dc6 third_party/mlir-hlo
+a3ea9052d6a16b13607046df6a324403fb51888d third_party/llvm-project
+4d4adc2e0dd7368b1a1cad6d8ebd26f9476ecbf0 third_party/mlir-hlo
 3f701faace7addc75d16dea8a6cd769fa5b3f260 third_party/musl
 59aa99860c60bd171b9565e9920f125fdb749267 third_party/pybind11
 e9cc6403341baf0edd430a4027b074d0a06b782f third_party/spirv_cross
 d53b49635b7484e86959608a65a64d8121e6a385 third_party/spirv_headers
 af1a5bc352164740c1cc1354942b1c6b72eacb8a third_party/stblib
-ef0b7c51b6cd0caac025bfe671e0b767e3413468 third_party/tensorflow
+f435ae9dee673e83504618b77e1be8cddda73e74 third_party/tensorflow
 058e89011fceca912d43638ebb6b85992147fcfe third_party/tracy
 9e62d027636cd7210f60d934f56107ed6e1579b8 third_party/vulkan_headers
 5c8b3ba955f0dbb30d18afc420f3a38adc779231 third_party/vulkan_memory_allocator
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 5ac8fd1..55f5d63 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -60,3 +60,32 @@
 ### Other project metrics
 
 TODO(#6161): Collect metrics for miscellaneous IREE system states
+
+## Developer notes
+
+These are ad-hoc notes to help developers triage benchmark errors.
+
+### Repro of TFLite model errors
+
+These steps help reproduce failures in the TFLite model benchmarks.
+
+1.  Install `iree-import-tflite`.
+    ```
+    $ python -m pip install iree-tools-tflite-snapshot -f https://github.com/google/iree/releases
+    ```
+
+2. Confirm that the `iree-import-tflite` binary is on your `PATH` by running
+    ```
+    $ iree-import-tflite --help
+    ```
+
+3. Download the TFLite flatbuffer for the failing benchmarks. The download
+    location for each model is listed in [this CMakeLists.txt file](./TFLite/CMakeLists.txt).
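+    For example, with `wget` (the URL placeholder below is illustrative;
+    substitute the location listed for the model you are reproducing):
+    ```
+    $ wget -O /tmp/model.tflite "<model URL from CMakeLists.txt>"
+    ```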
+
+4. Generate the input TOSA model by running
+    ```
+    $ iree-import-tflite <tflite file> -o <tosa output file>
+    ```
+
+5. Compile and run the benchmark using the exact flags listed in
+    [this CMakeLists.txt file](./TFLite/CMakeLists.txt).
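+    For reference, a typical invocation has the following shape. This is only
+    a sketch: flag names may differ between IREE releases, and `module.vmfb`
+    and `main` are placeholders, so prefer the flags listed in the
+    CMakeLists.txt file above.
+    ```
+    $ iree-translate \
+        -iree-mlir-to-vm-bytecode-module \
+        --iree-hal-target-backends=dylib-llvm-aot \
+        <tosa output file> -o module.vmfb
+    $ iree-benchmark-module \
+        --driver=dylib \
+        --module_file=module.vmfb \
+        --entry_function=main \
+        --task_topology_group_count=1
+    ```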
diff --git a/benchmarks/TFLite/CMakeLists.txt b/benchmarks/TFLite/CMakeLists.txt
index bd443a0..9702492 100644
--- a/benchmarks/TFLite/CMakeLists.txt
+++ b/benchmarks/TFLite/CMakeLists.txt
@@ -192,53 +192,55 @@
     "--task_topology_group_count=1"
 )
 
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
+# TODO(#7792): Re-enable these when we are able to run different benchmarks
+# depending on use-case (presubmit, postsubmit, nightly, etc.)
+# iree_benchmark_suite(
+#   MODULES
+#     "${DEEPLABV3_FP32_MODULE}"
+#     "${MOBILESSD_FP32_MODULE}"
+#     "${POSENET_FP32_MODULE}"
+#     "${MOBILEBERT_FP32_MODULE}"
+#     "${MOBILENET_V2_MODULE}"
+#     "${MOBILENET_V3SMALL_MODULE}"
 
-  BENCHMARK_MODES
-    "2-thread,big-core,full-inference,default-flags"
-    "2-thread,little-core,full-inference,default-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=2"
-)
+#   BENCHMARK_MODES
+#     "2-thread,big-core,full-inference,default-flags"
+#     "2-thread,little-core,full-inference,default-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=2"
+# )
 
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
+# iree_benchmark_suite(
+#   MODULES
+#     "${DEEPLABV3_FP32_MODULE}"
+#     "${MOBILESSD_FP32_MODULE}"
+#     "${POSENET_FP32_MODULE}"
+#     "${MOBILEBERT_FP32_MODULE}"
+#     "${MOBILENET_V2_MODULE}"
+#     "${MOBILENET_V3SMALL_MODULE}"
 
-  BENCHMARK_MODES
-    "3-thread,big-core,full-inference,default-flags"
-    "3-thread,little-core,full-inference,default-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=3"
-)
+#   BENCHMARK_MODES
+#     "3-thread,big-core,full-inference,default-flags"
+#     "3-thread,little-core,full-inference,default-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=3"
+# )
 
 iree_benchmark_suite(
   MODULES
@@ -369,6 +371,9 @@
     "dylib-sync"
 )
 
+# TODO(#7792): Consider re-enabling the little-core experimental-flags benchmarks
+# if we start optimizing for little cores, or run them only occasionally.
+
 # CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
 iree_benchmark_suite(
   MODULES
@@ -381,7 +386,7 @@
 
   BENCHMARK_MODES
     "1-thread,big-core,full-inference,experimental-flags"
-    "1-thread,little-core,full-inference,experimental-flags"
+    # "1-thread,little-core,full-inference,experimental-flags"
   TARGET_BACKEND
     "dylib-llvm-aot"
   TARGET_ARCHITECTURE
@@ -396,57 +401,59 @@
     "--task_topology_group_count=1"
 )
 
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
+# TODO(#7792): Re-enable these when we are able to run different benchmarks
+# depending on use-case (presubmit, postsubmit, nightly, etc.)
+# iree_benchmark_suite(
+#   MODULES
+#     "${DEEPLABV3_FP32_MODULE}"
+#     "${MOBILESSD_FP32_MODULE}"
+#     "${POSENET_FP32_MODULE}"
+#     "${MOBILEBERT_FP32_MODULE}"
+#     "${MOBILENET_V2_MODULE}"
+#     "${MOBILENET_V3SMALL_MODULE}"
 
-  BENCHMARK_MODES
-    "2-thread,big-core,full-inference,experimental-flags"
-    "2-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-inline-constants-max-byte-length=2048"
-    "--iree-llvm-loop-unrolling=true"
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=2"
-)
+#   BENCHMARK_MODES
+#     "2-thread,big-core,full-inference,experimental-flags"
+#     "2-thread,little-core,full-inference,experimental-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#     "--iree-flow-inline-constants-max-byte-length=2048"
+#     "--iree-llvm-loop-unrolling=true"
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=2"
+# )
 
-iree_benchmark_suite(
-  MODULES
-  "${DEEPLABV3_FP32_MODULE}"
-  "${MOBILESSD_FP32_MODULE}"
-  "${POSENET_FP32_MODULE}"
-  "${MOBILEBERT_FP32_MODULE}"
-  "${MOBILENET_V2_MODULE}"
-  "${MOBILENET_V3SMALL_MODULE}"
+# iree_benchmark_suite(
+#   MODULES
+#   "${DEEPLABV3_FP32_MODULE}"
+#   "${MOBILESSD_FP32_MODULE}"
+#   "${POSENET_FP32_MODULE}"
+#   "${MOBILEBERT_FP32_MODULE}"
+#   "${MOBILENET_V2_MODULE}"
+#   "${MOBILENET_V3SMALL_MODULE}"
 
-  BENCHMARK_MODES
-    "3-thread,big-core,full-inference,experimental-flags"
-    "3-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-inline-constants-max-byte-length=2048"
-    "--iree-llvm-loop-unrolling=true"
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=3"
-)
+#   BENCHMARK_MODES
+#     "3-thread,big-core,full-inference,experimental-flags"
+#     "3-thread,little-core,full-inference,experimental-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#     "--iree-flow-inline-constants-max-byte-length=2048"
+#     "--iree-llvm-loop-unrolling=true"
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=3"
+# )
 
 iree_benchmark_suite(
   MODULES
@@ -459,7 +466,7 @@
 
   BENCHMARK_MODES
     "4-thread,big-core,full-inference,experimental-flags"
-    "4-thread,little-core,full-inference,experimental-flags"
+    # "4-thread,little-core,full-inference,experimental-flags"
   TARGET_BACKEND
     "dylib-llvm-aot"
   TARGET_ARCHITECTURE
diff --git a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index c40e2ed..f1a7f30 100644
--- a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -123,6 +123,7 @@
       // Input should now be legal.
       .addPass(createVerifyInputLegalityPass);
 
+  passManager.addPass(mlir::createLinalgNamedOpConversionPass());
   buildGlobalOptimizationPassPipeline(passManager, transformOptions);
 
   // Perform cleanup after variable simplification as more canonicalizers may be
@@ -143,7 +144,6 @@
       .addPass(mlir::createCSEPass)
       .addPredicatedPass(clEnableLinalgDetensorize,
                          mlir::createLinalgDetensorizePass)
-
       // Dispatch region formation.
       .addPass(createConvertToFlowBeforeDispatchFormation)
       .addPass(mlir::createCanonicalizerPass)
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 4f60a42..a3ea905 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 4f60a42878b0d46bc2cc84d8f0d316cac2c60c9d
+Subproject commit a3ea9052d6a16b13607046df6a324403fb51888d
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index cf097ee..4d4adc2 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit cf097ee16b718cce7498747416772e1b3a7e9dc6
+Subproject commit 4d4adc2e0dd7368b1a1cad6d8ebd26f9476ecbf0
diff --git a/third_party/tensorflow b/third_party/tensorflow
index ef0b7c5..f435ae9 160000
--- a/third_party/tensorflow
+++ b/third_party/tensorflow
@@ -1 +1 @@
-Subproject commit ef0b7c51b6cd0caac025bfe671e0b767e3413468
+Subproject commit f435ae9dee673e83504618b77e1be8cddda73e74