Work around Vulkan driver bug triggered by native allocator code. (#14790)
Workaround for https://github.com/openxla/iree/issues/14776
I haven't quite reached the root cause here, but I think we're hitting
https://gitlab.freedesktop.org/mesa/mesa/-/issues/9251 and a
particularly bad failure mode from the driver. When we try to import
memory that is already mapped, allocations fail - both the current
allocation and future allocations.
More debugging notes are available
[on Discord](https://discord.com/channels/689900678990135345/689959648501039106/1143574510847676436).
diff --git a/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc
index 58c210c..2114c5b 100644
--- a/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc
@@ -14,6 +14,10 @@
#include "iree/hal/drivers/vulkan/sparse_buffer.h"
#include "iree/hal/drivers/vulkan/status_util.h"
+#if defined(IREE_PLATFORM_LINUX)
+#include <sys/mman.h>
+#endif // IREE_PLATFORM_LINUX
+
using namespace iree::hal::vulkan;
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
@@ -608,6 +612,27 @@
"external host memory import is not supported on this device");
}
+#if defined(IREE_PLATFORM_LINUX)
+ // First check if the memory is importable.
+ // Some drivers incorrectly succeed when attempting to import already-mapped
+ // memory: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9251.
+ //
+ // Attempt to synchronize the file with its memory map.
+ // If the memory is not mapped from a file, msync should fail fast with ENOMEM
+ // and we can import the buffer. If the memory *is* mapped from a file, import
+ // may fail on some drivers, so reject it (the msync may also be slow).
+
+ // TODO(scotttodd): Further restrict this slow path to buggy drivers only?
+ // We'd need to plumb some driver information through to here
+ errno = 0;
+ (void)msync(external_buffer->handle.host_allocation.ptr,
+ external_buffer->size, MS_SYNC);
+ if (errno != ENOMEM) {
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "cannot import mapped memory");
+ }
+#endif // IREE_PLATFORM_LINUX
+
// Query the properties of the pointer to see what memory types it can be
// imported with. This can be very expensive as on some platforms it does
// a linear scan of the virtual address range to ensure all pages have the
diff --git a/tests/e2e/models/fragment_000.mlir b/tests/e2e/models/fragment_000.mlir
index 3868f41..aacdd4b 100644
--- a/tests/e2e/models/fragment_000.mlir
+++ b/tests/e2e/models/fragment_000.mlir
@@ -1,7 +1,6 @@
// RUN: iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=vmvx %s | FileCheck %s
// RUN: iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=llvm-cpu %s | FileCheck %s
-// TODO(#14776): enable Vulkan when it passes on the bots - currently fails with an unexpected out-of-memory.
-// NO_R_U_N: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=vulkan-spirv %s | FileCheck %s)
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=vulkan-spirv %s | FileCheck %s)
// RUN: [[ $IREE_METAL_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-input-type=stablehlo --Xcompiler,iree-hal-target-backends=metal-spirv %s | FileCheck %s)
// CHECK-LABEL: EXEC @entry
diff --git a/tests/e2e/models/mnist_train_test/CMakeLists.txt b/tests/e2e/models/mnist_train_test/CMakeLists.txt
index 3e8ca84..a2ff8ef 100644
--- a/tests/e2e/models/mnist_train_test/CMakeLists.txt
+++ b/tests/e2e/models/mnist_train_test/CMakeLists.txt
@@ -39,18 +39,16 @@
)
endif()
-# TODO(#14776): enable Vulkan when it passes on the bots - currently fails with
-# an unexpected out-of-memory.
-# if(IREE_TARGET_BACKEND_VULKAN_SPIRV AND IREE_HAL_DRIVER_VULKAN)
-# iree_py_test(
-# NAME
-# mnist_train_test_vulkan
-# SRCS
-# "mnist_train_test.py"
-# ARGS
-# "--target_backend=vulkan-spirv"
-# "--driver=vulkan"
-# LABELS
-# "driver=vulkan"
-# )
-# endif()
+if(IREE_TARGET_BACKEND_VULKAN_SPIRV AND IREE_HAL_DRIVER_VULKAN)
+ iree_py_test(
+ NAME
+ mnist_train_test_vulkan
+ SRCS
+ "mnist_train_test.py"
+ ARGS
+ "--target_backend=vulkan-spirv"
+ "--driver=vulkan"
+ LABELS
+ "driver=vulkan"
+ )
+endif()
diff --git a/tests/e2e/tensor_ops/tensor_cast.mlir b/tests/e2e/tensor_ops/tensor_cast.mlir
index 563d576..13217b9 100644
--- a/tests/e2e/tensor_ops/tensor_cast.mlir
+++ b/tests/e2e/tensor_ops/tensor_cast.mlir
@@ -1,7 +1,6 @@
// RUN: iree-run-mlir --Xcompiler,iree-hal-target-backends=llvm-cpu %s | FileCheck %s
// RUN: [[ $IREE_VMVX_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-hal-target-backends=vmvx %s | FileCheck %s)
-// TODO(#14776): enable Vulkan when it passes on the bots - currently fails with an unexpected out-of-memory.
-// NO_R_U_N: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-hal-target-backends=vulkan-spirv %s | FileCheck %s)
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir --Xcompiler,iree-hal-target-backends=vulkan-spirv %s | FileCheck %s)
func.func @tensor_cast() -> tensor<2x?xf32> {
%input = util.unfoldable_constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf32>