Merge pull request #8534 from google/benvanik-align-64

Align HAL heap buffers to a configuration-defined minimum alignment.
diff --git a/iree/base/alignment.h b/iree/base/alignment.h
index 6f32ffe..1fd0356 100644
--- a/iree/base/alignment.h
+++ b/iree/base/alignment.h
@@ -10,6 +10,7 @@
 #ifndef IREE_BASE_ALIGNMENT_H_
 #define IREE_BASE_ALIGNMENT_H_
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
@@ -50,6 +51,12 @@
   return (value + (alignment - 1)) & ~(alignment - 1);
 }
 
+// Returns true if |value| is a multiple of the power-of-two |alignment|.
+static inline bool iree_host_size_has_alignment(iree_host_size_t value,
+                                                iree_host_size_t alignment) {
+  return iree_host_align(value, alignment) == value;
+}
+
 // Aligns |value| up to the given power-of-two |alignment| if required.
 // https://en.wikipedia.org/wiki/Data_structure_alignment#Computing_padding
 static inline iree_device_size_t iree_device_align(
@@ -57,6 +64,12 @@
   return (value + (alignment - 1)) & ~(alignment - 1);
 }
 
+// Returns true if |value| is a multiple of the power-of-two |alignment|.
+static inline bool iree_device_size_has_alignment(
+    iree_device_size_t value, iree_device_size_t alignment) {
+  return iree_device_align(value, alignment) == value;
+}
+
 // Returns the size of a struct padded out to iree_max_align_t.
 // This must be used when performing manual trailing allocation packing to
 // ensure the alignment requirements of the trailing data are satisfied.
diff --git a/iree/base/allocator.c b/iree/base/allocator.c
index f52482f..d409370 100644
--- a/iree/base/allocator.c
+++ b/iree/base/allocator.c
@@ -10,6 +10,10 @@
 #include "iree/base/api.h"
 #include "iree/base/tracing.h"
 
+//===----------------------------------------------------------------------===//
+// iree_allocator_t (std::allocator-like interface)
+//===----------------------------------------------------------------------===//
+
 static iree_status_t iree_allocator_issue_alloc(
     iree_allocator_t allocator, iree_allocator_command_t command,
     iree_host_size_t byte_length, void** inout_ptr) {
@@ -65,7 +69,7 @@
   IREE_ASSERT_ARGUMENT(params);
   IREE_ASSERT_ARGUMENT(inout_ptr);
   iree_host_size_t byte_length = params->byte_length;
-  if (byte_length == 0) {
+  if (IREE_UNLIKELY(byte_length == 0)) {
     return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "allocations must be >0 bytes");
   }
@@ -128,3 +132,130 @@
                               "unsupported system allocator command");
   }
 }
+
+//===----------------------------------------------------------------------===//
+// Aligned allocations via iree_allocator_t
+//===----------------------------------------------------------------------===//
+
+// Returns true if |alignment| is a power of two (or 0).
+// Zero is accepted because 0 & (0 - 1) evaluates to 0.
+static inline bool iree_alignment_is_pot(iree_host_size_t alignment) {
+  return (alignment & (alignment - 1)) == 0;
+}
+
+// Returns the pointer in |unaligned_ptr| whose byte at |offset| is aligned.
+static inline void* iree_aligned_ptr(void* unaligned_ptr,
+                                     iree_host_size_t alignment,
+                                     iree_host_size_t offset) {
+  return (void*)((((uintptr_t)unaligned_ptr + (alignment + sizeof(void*)) +
+                   offset) &
+                  ~(uintptr_t)(alignment - 1)) -
+                 offset);
+}
+
+// Returns the base pointer stored in the slot preceding |aligned_ptr|.
+static inline void* iree_aligned_ptr_get_base(void* aligned_ptr) {
+  void** ptr_ref =
+      (void**)((uintptr_t)aligned_ptr & ~(uintptr_t)(sizeof(void*) - 1));
+  return ptr_ref[-1];
+}
+
+// Stores |base_ptr| in the pointer-aligned slot preceding |aligned_ptr|.
+static inline void iree_aligned_ptr_set_base(void* aligned_ptr,
+                                             void* base_ptr) {
+  void** ptr_ref =
+      (void**)((uintptr_t)aligned_ptr & ~(uintptr_t)(sizeof(void*) - 1));
+  ptr_ref[-1] = base_ptr;
+}
+
+IREE_API_EXPORT iree_status_t iree_allocator_malloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** out_ptr) {
+  IREE_ASSERT_ARGUMENT(out_ptr);
+  if (IREE_UNLIKELY(byte_length == 0)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "allocations must be >0 bytes");
+  }
+  const iree_host_size_t alignment = iree_max(min_alignment, iree_max_align_t);
+  if (IREE_UNLIKELY(!iree_alignment_is_pot(alignment))) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "alignments must be powers of two (got %" PRIhsz ")", min_alignment);
+  }
+
+  // [padding...] [base ptr] [aligned data] [padding...]
+  const iree_host_size_t total_length =
+      sizeof(uintptr_t) + byte_length + alignment;
+  void* unaligned_ptr = NULL;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_malloc(allocator, total_length, (void**)&unaligned_ptr));
+  void* aligned_ptr = iree_aligned_ptr(unaligned_ptr, alignment, offset);
+
+  iree_aligned_ptr_set_base(aligned_ptr, unaligned_ptr);
+  *out_ptr = aligned_ptr;
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT iree_status_t iree_allocator_realloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** inout_ptr) {
+  IREE_ASSERT_ARGUMENT(inout_ptr);
+  if (!*inout_ptr) {
+    return iree_allocator_malloc_aligned(allocator, byte_length, min_alignment,
+                                         offset, inout_ptr);
+  }
+  if (IREE_UNLIKELY(byte_length == 0)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "allocations must be >0 bytes");
+  }
+  // Must mirror iree_allocator_malloc_aligned: the requested alignment is
+  // raised to at least iree_max_align_t or the placement check below fails.
+  const iree_host_size_t alignment = iree_max(min_alignment, iree_max_align_t);
+  if (IREE_UNLIKELY(!iree_alignment_is_pot(alignment))) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "alignments must be powers of two (got %" PRIhsz ")", min_alignment);
+  }
+  void* aligned_ptr = *inout_ptr;
+  void* unaligned_ptr = iree_aligned_ptr_get_base(aligned_ptr);
+  if (IREE_UNLIKELY(aligned_ptr !=
+                    iree_aligned_ptr(unaligned_ptr, alignment, offset))) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "reallocation must have the same alignment as the "
+                            "original allocation (got %" PRIhsz ")",
+                            min_alignment);
+  }
+
+  // The new block may place the aligned data at a different distance from its
+  // base, so capture where the valid data currently sits relative to the base.
+  uintptr_t old_offset = (uintptr_t)aligned_ptr - (uintptr_t)unaligned_ptr;
+
+  // [padding...] [base ptr] [aligned data] [padding...]
+  const iree_host_size_t total_length =
+      sizeof(uintptr_t) + byte_length + alignment;
+  IREE_RETURN_IF_ERROR(
+      iree_allocator_realloc(allocator, total_length, (void**)&unaligned_ptr));
+  aligned_ptr = iree_aligned_ptr(unaligned_ptr, alignment, offset);
+
+  const uint8_t* old_data = (uint8_t*)unaligned_ptr + old_offset;
+  uint8_t* new_data = (uint8_t*)aligned_ptr;
+  if (old_data != new_data) {
+    // Alignment at offset changed; copy data to the new aligned offset.
+    // NOTE: the old length is not stored so this moves the *new* byte length;
+    // the reallocation above guarantees that stays in-bounds (if inefficient).
+    // NOTE: memmove instead of memcpy as the regions may overlap.
+    memmove(new_data, old_data, byte_length);
+  }
+
+  iree_aligned_ptr_set_base(aligned_ptr, unaligned_ptr);
+  *inout_ptr = aligned_ptr;
+  return iree_ok_status();
+}
+
+IREE_API_EXPORT void iree_allocator_free_aligned(iree_allocator_t allocator,
+                                                 void* ptr) {
+  // Nothing to release for NULL; otherwise free the recorded base allocation.
+  if (!ptr) return;
+  void* base_ptr = iree_aligned_ptr_get_base(ptr);
+  iree_allocator_free(allocator, base_ptr);
+}
diff --git a/iree/base/allocator.h b/iree/base/allocator.h
index 6b71d54..9ac26f4 100644
--- a/iree/base/allocator.h
+++ b/iree/base/allocator.h
@@ -117,46 +117,6 @@
 #endif  // IREE_COMPILER_MSVC
 
 //===----------------------------------------------------------------------===//
-// C11 aligned_alloc compatibility shim
-//===----------------------------------------------------------------------===//
-
-#if defined(IREE_PLATFORM_WINDOWS)
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc
-#define iree_aligned_alloc(alignment, size) _aligned_malloc(size, alignment)
-#define iree_aligned_free(p) _aligned_free(p)
-#elif defined(_ISOC11_SOURCE)
-// https://en.cppreference.com/w/c/memory/aligned_alloc
-#define iree_aligned_alloc(alignment, size) aligned_alloc(alignment, size)
-#define iree_aligned_free(p) free(p)
-#elif _POSIX_C_SOURCE >= 200112L
-// https://pubs.opengroup.org/onlinepubs/9699919799/functions/posix_memalign.html
-static inline void* iree_aligned_alloc(size_t alignment, size_t size) {
-  void* ptr = NULL;
-  return posix_memalign(&ptr, alignment, size) == 0 ? ptr : NULL;
-}
-#define iree_aligned_free(p) free(p)
-#else
-// Emulates alignment with normal malloc. We overallocate by at least the
-// alignment + the size of a pointer, store the base pointer at p[-1], and
-// return the aligned pointer. This lets us easily get the base pointer in free
-// to pass back to the system.
-static inline void* iree_aligned_alloc(size_t alignment, size_t size) {
-  void* base_ptr = malloc(size + alignment + sizeof(uintptr_t));
-  if (!base_ptr) return NULL;
-  uintptr_t* aligned_ptr = (uintptr_t*)iree_host_align(
-      (uintptr_t)base_ptr + sizeof(uintptr_t), alignment);
-  aligned_ptr[-1] = (uintptr_t)base_ptr;
-  return aligned_ptr;
-}
-static inline void iree_aligned_free(void* p) {
-  if (IREE_UNLIKELY(!p)) return;
-  uintptr_t* aligned_ptr = (uintptr_t*)p;
-  void* base_ptr = (void*)aligned_ptr[-1];
-  free(base_ptr);
-}
-#endif  // IREE_PLATFORM_WINDOWS
-
-//===----------------------------------------------------------------------===//
 // iree_allocator_t (std::allocator-like interface)
 //===----------------------------------------------------------------------===//
 
@@ -246,6 +206,7 @@
 // If the reallocation fails then the original |inout_ptr| is unmodified.
 //
 // WARNING: when extending the newly allocated bytes are undefined.
+// TODO(benvanik): make them zeros; we should have an _uninitialized if needed.
 IREE_API_EXPORT iree_status_t iree_allocator_realloc(
     iree_allocator_t allocator, iree_host_size_t byte_length, void** inout_ptr);
 
@@ -281,6 +242,43 @@
   return allocator.ctl == NULL;
 }
 
+//===----------------------------------------------------------------------===//
+// Aligned allocations via iree_allocator_t
+//===----------------------------------------------------------------------===//
+
+// Allocates memory of size |byte_length| where the byte starting at |offset|
+// has a minimum alignment of |min_alignment|. In many cases |offset| can be 0.
+//
+// The |offset| can be used to ensure the alignment-sensitive portion of a
+// combined allocation is aligned while any prefix metadata has system
+// alignment. For example:
+//   typedef struct {
+//     uint32_t some_metadata;
+//     uint8_t data[];
+//   } buffer_t;
+//   buffer_t* buffer = NULL;
+//   iree_allocator_malloc_aligned(allocator, sizeof(buffer_t) + length,
+//                                 4096, offsetof(buffer_t, data), &buffer);
+//   // `buffer` has system alignment, but the `data` will be aligned on at
+//   // least a 4096 boundary.
+//
+// The contents of the returned memory are guaranteed to be zeroed.
+IREE_API_EXPORT iree_status_t iree_allocator_malloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** out_ptr);
+
+// Reallocates memory to |byte_length|, growing or shrinking as needed.
+// Only valid on memory allocated with iree_allocator_malloc_aligned.
+// The newly reallocated memory will have the byte at |offset| aligned to at
+// least |min_alignment|.
+IREE_API_EXPORT iree_status_t iree_allocator_realloc_aligned(
+    iree_allocator_t allocator, iree_host_size_t byte_length,
+    iree_host_size_t min_alignment, iree_host_size_t offset, void** inout_ptr);
+
+// Frees a |ptr| previously returned from iree_allocator_malloc_aligned.
+IREE_API_EXPORT void iree_allocator_free_aligned(iree_allocator_t allocator,
+                                                 void* ptr);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/iree/base/config.h b/iree/base/config.h
index 022a522..e82c02d 100644
--- a/iree/base/config.h
+++ b/iree/base/config.h
@@ -155,6 +155,14 @@
 // Enables optional HAL features. Each of these may add several KB to the final
 // binary when linked dynamically.
 
+#if !defined(IREE_HAL_HEAP_BUFFER_ALIGNMENT)
+// Power of two byte alignment required on all host heap buffers.
+// Executables are compiled with alignment expectations and the runtime
+// alignment must be greater than or equal to the alignment set in the compiler.
+// External buffers wrapped by HAL buffers must meet this alignment requirement.
+#define IREE_HAL_HEAP_BUFFER_ALIGNMENT 64
+#endif  // IREE_HAL_HEAP_BUFFER_ALIGNMENT
+
 #if !defined(IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE)
 // Enables additional validation of commands issued against command buffers.
 // This adds small amounts of per-command overhead but in all but the most
diff --git a/iree/base/status.c b/iree/base/status.c
index 341630e..d71ba53 100644
--- a/iree/base/status.c
+++ b/iree/base/status.c
@@ -23,6 +23,46 @@
 #include "iree/base/tracing.h"
 
 //===----------------------------------------------------------------------===//
+// C11 aligned_alloc compatibility shim
+//===----------------------------------------------------------------------===//
+
+#if defined(IREE_PLATFORM_WINDOWS)
+// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc
+#define iree_aligned_alloc(alignment, size) _aligned_malloc(size, alignment)
+#define iree_aligned_free(p) _aligned_free(p)
+#elif defined(_ISOC11_SOURCE)
+// https://en.cppreference.com/w/c/memory/aligned_alloc
+#define iree_aligned_alloc(alignment, size) aligned_alloc(alignment, size)
+#define iree_aligned_free(p) free(p)
+#elif _POSIX_C_SOURCE >= 200112L
+// https://pubs.opengroup.org/onlinepubs/9699919799/functions/posix_memalign.html
+static inline void* iree_aligned_alloc(size_t alignment, size_t size) {
+  void* ptr = NULL;
+  return posix_memalign(&ptr, alignment, size) == 0 ? ptr : NULL;
+}
+#define iree_aligned_free(p) free(p)
+#else
+// Emulates alignment with normal malloc. We overallocate by the alignment +
+// the size of a pointer, store the base pointer at p[-1], and return the
+// aligned pointer so that free can recover the base to pass to the system.
+// |alignment| must be a power of two as iree_host_align assumes one.
+static inline void* iree_aligned_alloc(size_t alignment, size_t size) {
+  void* base_ptr = malloc(size + alignment + sizeof(uintptr_t));
+  if (!base_ptr) return NULL;
+  uintptr_t* aligned_ptr = (uintptr_t*)iree_host_align(
+      (uintptr_t)base_ptr + sizeof(uintptr_t), alignment);
+  aligned_ptr[-1] = (uintptr_t)base_ptr;
+  return aligned_ptr;
+}
+static inline void iree_aligned_free(void* p) {
+  if (IREE_UNLIKELY(!p)) return;
+  uintptr_t* aligned_ptr = (uintptr_t*)p;
+  void* base_ptr = (void*)aligned_ptr[-1];
+  free(base_ptr);
+}
+#endif  // IREE_PLATFORM_WINDOWS
+
+//===----------------------------------------------------------------------===//
 // iree_status_t canonical errors
 //===----------------------------------------------------------------------===//
 
diff --git a/iree/hal/buffer.h b/iree/hal/buffer.h
index f41e2a5..fb23ce0 100644
--- a/iree/hal/buffer.h
+++ b/iree/hal/buffer.h
@@ -517,22 +517,6 @@
     iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer);
 
 //===----------------------------------------------------------------------===//
-// iree_hal_heap_buffer_t
-//===----------------------------------------------------------------------===//
-
-// Wraps an existing host allocation in a buffer.
-// When the buffer is destroyed the provided |data_allocator| will be used to
-// free |data|. Pass iree_allocator_null() to wrap without ownership semantics.
-//
-// |out_buffer| must be released by the caller.
-IREE_API_EXPORT iree_status_t iree_hal_heap_buffer_wrap(
-    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
-    iree_hal_memory_access_t allowed_access,
-    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
-    iree_byte_span_t data, iree_allocator_t data_allocator,
-    iree_hal_buffer_t** out_buffer);
-
-//===----------------------------------------------------------------------===//
 // iree_hal_buffer_t implementation details
 //===----------------------------------------------------------------------===//
 
diff --git a/iree/hal/buffer_heap.c b/iree/hal/buffer_heap.c
index 7d3ec2f..e2c3ee8 100644
--- a/iree/hal/buffer_heap.c
+++ b/iree/hal/buffer_heap.c
@@ -27,6 +27,31 @@
 
 static const iree_hal_buffer_vtable_t iree_hal_heap_buffer_vtable;
 
+enum {  // Flags packed into the low bits of the buffer's data.data pointer.
+  IREE_HAL_HEAP_BUFFER_DATA_IS_ALIGNED = 1u << 0,
+  IREE_HAL_HEAP_BUFFER_METADATA_IS_ALIGNED = 1u << 1,
+  IREE_HAL_HEAP_BUFFER_FLAG_MASK = IREE_HAL_HEAP_BUFFER_DATA_IS_ALIGNED |
+                                   IREE_HAL_HEAP_BUFFER_METADATA_IS_ALIGNED,
+};
+
+static inline uint8_t* iree_hal_heap_buffer_ptr(
+    const iree_hal_heap_buffer_t* buffer) {
+  const uintptr_t flag_mask = (uintptr_t)IREE_HAL_HEAP_BUFFER_FLAG_MASK;
+  return (uint8_t*)((uintptr_t)buffer->data.data & ~flag_mask);
+}
+
+static inline bool iree_hal_heap_buffer_data_is_aligned(
+    const iree_hal_heap_buffer_t* buffer) {
+  const uintptr_t tagged_ptr = (uintptr_t)buffer->data.data;
+  return iree_any_bit_set(tagged_ptr, IREE_HAL_HEAP_BUFFER_DATA_IS_ALIGNED);
+}
+
+static inline bool iree_hal_heap_buffer_metadata_is_aligned(
+    const iree_hal_heap_buffer_t* buffer) {
+  const uintptr_t tagged_ptr = (uintptr_t)buffer->data.data;
+  return iree_any_bit_set(tagged_ptr, IREE_HAL_HEAP_BUFFER_METADATA_IS_ALIGNED);
+}
+
 // Allocates a buffer with the metadata and storage split.
 // This results in an additional host allocation but allows for user-overridden
 // data storage allocations.
@@ -35,16 +60,23 @@
     iree_allocator_t host_allocator, iree_hal_heap_buffer_t** out_buffer,
     iree_byte_span_t* out_data) {
   // Try allocating the storage first as it's the most likely to fail if OOM.
+  // It must be aligned to the minimum buffer alignment.
   out_data->data_length = allocation_size;
-  IREE_RETURN_IF_ERROR(iree_allocator_malloc(data_allocator, allocation_size,
-                                             (void**)&out_data->data));
+  uintptr_t data_ptr = 0;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc_aligned(
+      data_allocator, allocation_size, IREE_HAL_HEAP_BUFFER_ALIGNMENT,
+      /*offset=*/0, (void**)&data_ptr));
+  IREE_ASSERT_TRUE(
+      iree_host_size_has_alignment(data_ptr, IREE_HAL_HEAP_BUFFER_ALIGNMENT));
+  data_ptr |= IREE_HAL_HEAP_BUFFER_DATA_IS_ALIGNED;
+  out_data->data = (uint8_t*)data_ptr;
 
-  // Allocate the host metadata wrapper.
+  // Allocate the host metadata wrapper with natural alignment.
   iree_status_t status = iree_allocator_malloc(
       host_allocator, sizeof(**out_buffer), (void**)out_buffer);
   if (!iree_status_is_ok(status)) {
     // Need to free the storage we just allocated.
-    iree_allocator_free(data_allocator, out_data->data);
+    iree_allocator_free_aligned(data_allocator, out_data->data);
   }
   return status;
 }
@@ -55,16 +87,29 @@
 static iree_status_t iree_hal_heap_buffer_allocate_slab(
     iree_device_size_t allocation_size, iree_allocator_t host_allocator,
     iree_hal_heap_buffer_t** out_buffer, iree_byte_span_t* out_data) {
-  // NOTE: we want the buffer data to always be 16-byte aligned.
+  // The metadata header is always aligned and we want to ensure it's padded
+  // out to the max alignment.
   iree_hal_heap_buffer_t* buffer = NULL;
   iree_host_size_t header_size =
-      iree_host_align(iree_sizeof_struct(*buffer), 16);
+      iree_host_align(iree_sizeof_struct(*buffer), iree_max_align_t);
   iree_host_size_t total_size = header_size + allocation_size;
-  IREE_RETURN_IF_ERROR(
-      iree_allocator_malloc(host_allocator, total_size, (void**)&buffer));
+
+  // Allocate with the data starting at offset header_size aligned to the
+  // minimum required buffer alignment. The header itself will still be aligned
+  // to the natural alignment but our buffer alignment is often much larger.
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc_aligned(
+      host_allocator, total_size, IREE_HAL_HEAP_BUFFER_ALIGNMENT, header_size,
+      (void**)&buffer));
   *out_buffer = buffer;
-  *out_data =
-      iree_make_byte_span((uint8_t*)buffer + header_size, allocation_size);
+
+  // Set bit indicating that we need to free the metadata with
+  // iree_allocator_free_aligned.
+  uintptr_t data_ptr = (uintptr_t)buffer + header_size;
+  IREE_ASSERT_TRUE(
+      iree_host_size_has_alignment(data_ptr, IREE_HAL_HEAP_BUFFER_ALIGNMENT));
+  data_ptr |= IREE_HAL_HEAP_BUFFER_METADATA_IS_ALIGNED;
+  *out_data = iree_make_byte_span((uint8_t*)data_ptr, allocation_size);
+
   return iree_ok_status();
 }
 
@@ -82,7 +127,7 @@
   // If the data and host allocators are the same we can allocate more
   // efficiently as a large slab. Otherwise we need to allocate both the
   // metadata and the storage independently.
-  bool same_allocator =
+  const bool same_allocator =
       memcmp(&data_allocator, &host_allocator, sizeof(data_allocator)) == 0;
 
   iree_hal_heap_buffer_t* buffer = NULL;
@@ -120,16 +165,27 @@
   return status;
 }
 
-IREE_API_EXPORT iree_status_t iree_hal_heap_buffer_wrap(
-    iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type,
-    iree_hal_memory_access_t allowed_access,
-    iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size,
-    iree_byte_span_t data, iree_allocator_t data_allocator,
-    iree_hal_buffer_t** out_buffer) {
+iree_status_t iree_hal_heap_buffer_wrap(iree_hal_allocator_t* allocator,
+                                        iree_hal_memory_type_t memory_type,
+                                        iree_hal_memory_access_t allowed_access,
+                                        iree_hal_buffer_usage_t allowed_usage,
+                                        iree_device_size_t allocation_size,
+                                        iree_byte_span_t data,
+                                        iree_allocator_t data_allocator,
+                                        iree_hal_buffer_t** out_buffer) {
   IREE_ASSERT_ARGUMENT(allocator);
   IREE_ASSERT_ARGUMENT(out_buffer);
   IREE_TRACE_ZONE_BEGIN(z0);
 
+  uintptr_t data_ptr = (uintptr_t)data.data & ~IREE_HAL_HEAP_BUFFER_FLAG_MASK;
+  if (!iree_host_size_has_alignment(data_ptr, IREE_HAL_HEAP_BUFFER_ALIGNMENT)) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "imported heap buffer data must be aligned to %d; got %p",
+        (int)IREE_HAL_HEAP_BUFFER_ALIGNMENT, (void*)data_ptr);
+  }
+
   iree_allocator_t host_allocator =
       iree_hal_allocator_host_allocator(allocator);
   iree_hal_heap_buffer_t* buffer = NULL;
@@ -164,8 +220,16 @@
     }
   });
 
-  iree_allocator_free(buffer->data_allocator, buffer->data.data);
-  iree_allocator_free(host_allocator, buffer);
+  if (iree_hal_heap_buffer_data_is_aligned(buffer)) {
+    iree_allocator_free_aligned(buffer->data_allocator, buffer->data.data);
+  } else {
+    iree_allocator_free(buffer->data_allocator, buffer->data.data);
+  }
+  if (iree_hal_heap_buffer_metadata_is_aligned(buffer)) {
+    iree_allocator_free_aligned(host_allocator, buffer);
+  } else {
+    iree_allocator_free(host_allocator, buffer);
+  }
 
   IREE_TRACE_ZONE_END(z0);
 }
@@ -176,8 +240,8 @@
     iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length,
     iree_hal_buffer_mapping_t* mapping) {
   iree_hal_heap_buffer_t* buffer = (iree_hal_heap_buffer_t*)base_buffer;
-  mapping->contents = iree_make_byte_span(buffer->data.data + local_byte_offset,
-                                          local_byte_length);
+  mapping->contents = iree_make_byte_span(
+      iree_hal_heap_buffer_ptr(buffer) + local_byte_offset, local_byte_length);
 
   // If we mapped for discard scribble over the bytes. This is not a mandated
   // behavior but it will make debugging issues easier. Alternatively for
diff --git a/iree/hal/buffer_heap_impl.h b/iree/hal/buffer_heap_impl.h
index 5a2c5ac..5ade6e6 100644
--- a/iree/hal/buffer_heap_impl.h
+++ b/iree/hal/buffer_heap_impl.h
@@ -38,6 +38,26 @@
     iree_allocator_t data_allocator, iree_allocator_t host_allocator,
     iree_hal_buffer_t** out_buffer);
 
+// Wraps an existing host allocation in a buffer.
+// When the buffer is destroyed the provided |data_allocator| will be used to
+// free |data| using iree_allocator_free. Pass iree_allocator_null() to wrap
+// without ownership semantics.
+//
+// The buffer must be aligned to at least IREE_HAL_HEAP_BUFFER_ALIGNMENT.
+// Note that it will be freed as a normal unaligned allocation. If we find
+// ourselves wanting to wrap aligned allocations requiring
+// iree_allocator_free_aligned then we'll need a flag to indicate that.
+//
+// |out_buffer| must be released by the caller.
+iree_status_t iree_hal_heap_buffer_wrap(iree_hal_allocator_t* allocator,
+                                        iree_hal_memory_type_t memory_type,
+                                        iree_hal_memory_access_t allowed_access,
+                                        iree_hal_buffer_usage_t allowed_usage,
+                                        iree_device_size_t allocation_size,
+                                        iree_byte_span_t data,
+                                        iree_allocator_t data_allocator,
+                                        iree_hal_buffer_t** out_buffer);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/iree/hal/cts/buffer_mapping_test.h b/iree/hal/cts/buffer_mapping_test.h
index 652d010..0019c67 100644
--- a/iree/hal/cts/buffer_mapping_test.h
+++ b/iree/hal/cts/buffer_mapping_test.h
@@ -541,7 +541,6 @@
 }
 
 // TODO(scotttodd): iree_hal_allocator_wrap_buffer
-// TODO(scotttodd): iree_hal_heap_buffer_wrap
 
 }  // namespace cts
 }  // namespace hal
diff --git a/iree/runtime/demo/hello_world_explained.c b/iree/runtime/demo/hello_world_explained.c
index 886aec1..2a981aa 100644
--- a/iree/runtime/demo/hello_world_explained.c
+++ b/iree/runtime/demo/hello_world_explained.c
@@ -191,7 +191,8 @@
     iree_hal_buffer_view_t* arg0 = NULL;
     if (iree_status_is_ok(status)) {
       static const iree_hal_dim_t arg0_shape[1] = {4};
-      static const float arg0_data[4] = {1.0f, 1.1f, 1.2f, 1.3f};
+      static const float iree_alignas(64)
+          arg0_data[4] = {1.0f, 1.1f, 1.2f, 1.3f};
       status = iree_hal_buffer_view_wrap_or_clone_heap_buffer(
           device_allocator,
           // Shape dimensions and rank:
@@ -230,7 +231,8 @@
     iree_hal_buffer_view_t* arg1 = NULL;
     if (iree_status_is_ok(status)) {
       static const iree_hal_dim_t arg1_shape[1] = {4};
-      static const float arg1_data[4] = {10.0f, 100.0f, 1000.0f, 10000.0f};
+      static const float iree_alignas(64)
+          arg1_data[4] = {10.0f, 100.0f, 1000.0f, 10000.0f};
       status = iree_hal_buffer_view_wrap_or_clone_heap_buffer(
           device_allocator, arg1_shape, IREE_ARRAYSIZE(arg1_shape),
           IREE_HAL_ELEMENT_TYPE_FLOAT_32,
diff --git a/iree/runtime/demo/hello_world_terse.c b/iree/runtime/demo/hello_world_terse.c
index 2f715bc..121a2e5 100644
--- a/iree/runtime/demo/hello_world_terse.c
+++ b/iree/runtime/demo/hello_world_terse.c
@@ -78,7 +78,8 @@
   // %arg0: tensor<4xf32>
   iree_hal_buffer_view_t* arg0 = NULL;
   static const iree_hal_dim_t arg0_shape[1] = {4};
-  static const float arg0_data[4] = {1.0f, 1.1f, 1.2f, 1.3f};
+  static const float iree_alignas(IREE_HAL_HEAP_BUFFER_ALIGNMENT)
+      arg0_data[4] = {1.0f, 1.1f, 1.2f, 1.3f};
   IREE_CHECK_OK(iree_hal_buffer_view_wrap_or_clone_heap_buffer(
       iree_runtime_session_device_allocator(session), arg0_shape,
       IREE_ARRAYSIZE(arg0_shape), IREE_HAL_ELEMENT_TYPE_FLOAT_32,
@@ -102,7 +103,8 @@
   // %arg1: tensor<4xf32>
   iree_hal_buffer_view_t* arg1 = NULL;
   static const iree_hal_dim_t arg1_shape[1] = {4};
-  static const float arg1_data[4] = {10.0f, 100.0f, 1000.0f, 10000.0f};
+  static const float iree_alignas(IREE_HAL_HEAP_BUFFER_ALIGNMENT)
+      arg1_data[4] = {10.0f, 100.0f, 1000.0f, 10000.0f};
   IREE_CHECK_OK(iree_hal_buffer_view_wrap_or_clone_heap_buffer(
       iree_runtime_session_device_allocator(session), arg1_shape,
       IREE_ARRAYSIZE(arg1_shape), IREE_HAL_ELEMENT_TYPE_FLOAT_32,
diff --git a/iree/samples/custom_modules/custom_modules_test.cc b/iree/samples/custom_modules/custom_modules_test.cc
index cf4e306..df2f190 100644
--- a/iree/samples/custom_modules/custom_modules_test.cc
+++ b/iree/samples/custom_modules/custom_modules_test.cc
@@ -131,8 +131,8 @@
 TEST_F(CustomModulesTest, PrintTensor) {
   // Allocate the buffer we'll be printing.
   static iree_hal_dim_t kShape[] = {2, 4};
-  static float kBufferContents[2 * 4] = {0.0f, 1.0f, 2.0f, 3.0f,
-                                         4.0f, 5.0f, 6.0f, 7.0f};
+  static float iree_alignas(IREE_HAL_HEAP_BUFFER_ALIGNMENT)
+      kBufferContents[2 * 4] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
   iree_hal_buffer_params_t params = {0};
   params.type =
       IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;
@@ -179,8 +179,8 @@
 TEST_F(CustomModulesTest, RoundTripTensor) {
   // Allocate the buffer we'll be printing/parsing.
   static iree_hal_dim_t kShape[] = {2, 4};
-  static float kBufferContents[2 * 4] = {0.0f, 1.0f, 2.0f, 3.0f,
-                                         4.0f, 5.0f, 6.0f, 7.0f};
+  static float iree_alignas(IREE_HAL_HEAP_BUFFER_ALIGNMENT)
+      kBufferContents[2 * 4] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
   iree_hal_buffer_params_t params = {0};
   params.type =
       IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE;