Adds iree_unaligned_load+store to emulate unaligned access. (#6573)

Possible fix for #6566.
diff --git a/iree/base/alignment.h b/iree/base/alignment.h
index 215db3c..5229d97 100644
--- a/iree/base/alignment.h
+++ b/iree/base/alignment.h
@@ -11,6 +11,8 @@
 #define IREE_BASE_ALIGNMENT_H_
 
 #include <stddef.h>
+#include <stdint.h>
+#include <string.h>
 
 #include "iree/base/config.h"
 #include "iree/base/target_platform.h"
@@ -69,6 +71,164 @@
 //  IREE_CHECK_OK(iree_allocator_malloc(allocator, total_size, (void**)&p));
 #define iree_sizeof_struct(t) iree_host_align(sizeof(t), iree_max_align_t)
 
+//===----------------------------------------------------------------------===//
+// Alignment-safe memory accesses
+//===----------------------------------------------------------------------===//
+
+// Map little-endian byte indices in memory to the host memory order indices.
+#if defined(IREE_ENDIANNESS_LITTLE)
+#define IREE_LE_IDX_1(i) (i)
+#define IREE_LE_IDX_2(i) (i)
+#define IREE_LE_IDX_4(i) (i)
+#define IREE_LE_IDX_8(i) (i)
+#else
+#define IREE_LE_IDX_1(i) (i)
+#define IREE_LE_IDX_2(i) (1 - (i))
+#define IREE_LE_IDX_4(i) (3 - (i))
+#define IREE_LE_IDX_8(i) (7 - (i))
+#endif  // IREE_ENDIANNESS_*
+
+#if IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED
+
+static inline uint8_t iree_unaligned_load_le_u8(const uint8_t* ptr) {
+  return ptr[0];
+}
+static inline uint16_t iree_unaligned_load_le_u16(const uint16_t* ptr) {
+  const uint8_t* b = (const uint8_t*)ptr;
+  return ((uint16_t)b[IREE_LE_IDX_2(1)] << 8) | ((uint16_t)b[IREE_LE_IDX_2(0)]);
+}
+static inline uint32_t iree_unaligned_load_le_u32(const uint32_t* ptr) {
+  const uint8_t* b = (const uint8_t*)ptr;
+  return ((uint32_t)b[IREE_LE_IDX_4(3)] << 24) |
+         ((uint32_t)b[IREE_LE_IDX_4(2)] << 16) |
+         ((uint32_t)b[IREE_LE_IDX_4(1)] << 8) |
+         ((uint32_t)b[IREE_LE_IDX_4(0)]);
+}
+static inline uint64_t iree_unaligned_load_le_u64(const uint64_t* ptr) {
+  const uint8_t* b = (const uint8_t*)ptr;
+  return ((uint64_t)b[IREE_LE_IDX_8(7)] << 56) |
+         ((uint64_t)b[IREE_LE_IDX_8(6)] << 48) |
+         ((uint64_t)b[IREE_LE_IDX_8(5)] << 40) |
+         ((uint64_t)b[IREE_LE_IDX_8(4)] << 32) |
+         ((uint64_t)b[IREE_LE_IDX_8(3)] << 24) |
+         ((uint64_t)b[IREE_LE_IDX_8(2)] << 16) |
+         ((uint64_t)b[IREE_LE_IDX_8(1)] << 8) |
+         ((uint64_t)b[IREE_LE_IDX_8(0)]);
+}
+static inline float iree_unaligned_load_le_f32(const float* ptr) {
+  uint32_t bits = iree_unaligned_load_le_u32((const uint32_t*)ptr);
+  float value;
+  memcpy(&value, &bits, sizeof(value));
+  return value;
+}
+static inline double iree_unaligned_load_le_f64(const double* ptr) {
+  uint64_t bits = iree_unaligned_load_le_u64((const uint64_t*)ptr);
+  double value;
+  memcpy(&value, &bits, sizeof(value));
+  return value;
+}
+
+static inline void iree_unaligned_store_le_u8(uint8_t* ptr, uint8_t value) {
+  *ptr = value;
+}
+static inline void iree_unaligned_store_le_u16(uint16_t* ptr, uint16_t value) {
+  uint8_t* p = (uint8_t*)ptr;
+  p[IREE_LE_IDX_2(0)] = (uint8_t)value;
+  p[IREE_LE_IDX_2(1)] = (uint8_t)(value >> 8);
+}
+static inline void iree_unaligned_store_le_u32(uint32_t* ptr, uint32_t value) {
+  uint8_t* p = (uint8_t*)ptr;
+  p[IREE_LE_IDX_4(0)] = (uint8_t)value;
+  p[IREE_LE_IDX_4(1)] = (uint8_t)(value >> 8);
+  p[IREE_LE_IDX_4(2)] = (uint8_t)(value >> 16);
+  p[IREE_LE_IDX_4(3)] = (uint8_t)(value >> 24);
+}
+static inline void iree_unaligned_store_le_u64(uint64_t* ptr, uint64_t value) {
+  uint8_t* p = (uint8_t*)ptr;
+  p[IREE_LE_IDX_8(0)] = (uint8_t)value;
+  p[IREE_LE_IDX_8(1)] = (uint8_t)(value >> 8);
+  p[IREE_LE_IDX_8(2)] = (uint8_t)(value >> 16);
+  p[IREE_LE_IDX_8(3)] = (uint8_t)(value >> 24);
+  p[IREE_LE_IDX_8(4)] = (uint8_t)(value >> 32);
+  p[IREE_LE_IDX_8(5)] = (uint8_t)(value >> 40);
+  p[IREE_LE_IDX_8(6)] = (uint8_t)(value >> 48);
+  p[IREE_LE_IDX_8(7)] = (uint8_t)(value >> 56);
+}
+static inline void iree_unaligned_store_le_f32(float* ptr, float value) {
+  uint32_t uint_value;
+  memcpy(&uint_value, &value, sizeof(value));
+  iree_unaligned_store_le_u32((uint32_t*)ptr, uint_value);
+}
+static inline void iree_unaligned_store_le_f64(double* ptr, double value) {
+  uint64_t uint_value;
+  memcpy(&uint_value, &value, sizeof(value));
+  iree_unaligned_store_le_u64((uint64_t*)ptr, uint_value);
+}
+
+#else
+
+#if defined(IREE_ENDIANNESS_LITTLE)
+
+#define iree_unaligned_load_le_u8(ptr) (*(ptr))
+#define iree_unaligned_load_le_u16(ptr) (*(ptr))
+#define iree_unaligned_load_le_u32(ptr) (*(ptr))
+#define iree_unaligned_load_le_u64(ptr) (*(ptr))
+#define iree_unaligned_load_le_f32(ptr) (*(ptr))
+#define iree_unaligned_load_le_f64(ptr) (*(ptr))
+
+#define iree_unaligned_store_le_u8(ptr, value) (*(ptr) = (value))
+#define iree_unaligned_store_le_u16(ptr, value) (*(ptr) = (value))
+#define iree_unaligned_store_le_u32(ptr, value) (*(ptr) = (value))
+#define iree_unaligned_store_le_u64(ptr, value) (*(ptr) = (value))
+#define iree_unaligned_store_le_f32(ptr, value) (*(ptr) = (value))
+#define iree_unaligned_store_le_f64(ptr, value) (*(ptr) = (value))
+
+#else
+
+#error "TODO(benvanik): little-endian load/store for big-endian archs"
+
+#endif  // IREE_ENDIANNESS_*
+
+#endif  // IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED
+
+// clang-format off
+
+// Dereferences |ptr| and returns the value.
+// Automatically handles unaligned accesses on architectures that may not
+// support them natively (or efficiently). Memory is treated as little-endian.
+#define iree_unaligned_load_le(ptr)                                            \
+  _Generic((ptr),                                                              \
+        int8_t*: iree_unaligned_load_le_u8((const uint8_t*)(ptr)),             \
+       uint8_t*: iree_unaligned_load_le_u8((const uint8_t*)(ptr)),             \
+       int16_t*: iree_unaligned_load_le_u16((const uint16_t*)(ptr)),           \
+      uint16_t*: iree_unaligned_load_le_u16((const uint16_t*)(ptr)),           \
+       int32_t*: iree_unaligned_load_le_u32((const uint32_t*)(ptr)),           \
+      uint32_t*: iree_unaligned_load_le_u32((const uint32_t*)(ptr)),           \
+       int64_t*: iree_unaligned_load_le_u64((const uint64_t*)(ptr)),           \
+      uint64_t*: iree_unaligned_load_le_u64((const uint64_t*)(ptr)),           \
+         float*: iree_unaligned_load_le_f32((const float*)(ptr)),              \
+        double*: iree_unaligned_load_le_f64((const double*)(ptr))              \
+  )
+
+// Dereferences |ptr| and writes the given |value|.
+// Automatically handles unaligned accesses on architectures that may not
+// support them natively (or efficiently). Memory is treated as little-endian.
+#define iree_unaligned_store_le(ptr, value)                                    \
+  _Generic((ptr),                                                              \
+        int8_t*: iree_unaligned_store_le_u8((uint8_t*)(ptr), (value)),         \
+       uint8_t*: iree_unaligned_store_le_u8((uint8_t*)(ptr), (value)),         \
+       int16_t*: iree_unaligned_store_le_u16((uint16_t*)(ptr), (value)),       \
+      uint16_t*: iree_unaligned_store_le_u16((uint16_t*)(ptr), (value)),       \
+       int32_t*: iree_unaligned_store_le_u32((uint32_t*)(ptr), (value)),       \
+      uint32_t*: iree_unaligned_store_le_u32((uint32_t*)(ptr), (value)),       \
+       int64_t*: iree_unaligned_store_le_u64((uint64_t*)(ptr), (value)),       \
+      uint64_t*: iree_unaligned_store_le_u64((uint64_t*)(ptr), (value)),       \
+         float*: iree_unaligned_store_le_f32((float*)(ptr), (value)),          \
+        double*: iree_unaligned_store_le_f64((double*)(ptr), (value))          \
+  )
+
+// clang-format on
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/iree/base/target_platform.h b/iree/base/target_platform.h
index 4277d71..8052967 100644
--- a/iree/base/target_platform.h
+++ b/iree/base/target_platform.h
@@ -30,6 +30,8 @@
 // IREE_ENDIANNESS_LITTLE
 // IREE_ENDIANNESS_BIG
 //
+// IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED (0/1)
+//
 // IREE_COMPILER_CLANG
 // IREE_COMPILER_GCC
 // IREE_COMPILER_GCC_COMPAT
@@ -138,6 +140,42 @@
 #endif  // __BYTE_ORDER__
 
 //==============================================================================
+// IREE_MEMORY_ACCESS_*
+//==============================================================================
+// Certain architectures have specific memory access requirements that require
+// user-mode code changes to work at all or work at reasonable performance.
+
+// Users may predefine IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED to override the
+// per-architecture defaults below.
+#if !defined(IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED)
+
+#if defined(IREE_ARCH_ARM_32) || defined(IREE_ARCH_ARM_64)
+
+// Armv6-M and Armv8-M (w/o the main extension) do not support unaligned access.
+// The -munaligned-access and -mno-unaligned-access flags control this.
+// https://www.keil.com/support/man/docs/armclang_ref/armclang_ref_sam1444138667173.htm
+#if !defined(__ARM_FEATURE_UNALIGNED)
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 1
+#else
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 0
+#endif  // !__ARM_FEATURE_UNALIGNED
+
+#elif defined(IREE_ARCH_RISCV_32) || defined(IREE_ARCH_RISCV_64)
+
+// Though unaligned access is part of the base spec it is allowed to be
+// implemented with trap handlers. Bare-metal systems likely won't have these
+// handlers and even on systems that do (linux) we don't want to be trapping for
+// every load/store.
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 1
+
+#else
+
+// Assume all other architectures support unaligned access natively.
+#define IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED 0
+
+#endif  // IREE_ARCH_*
+
+#endif  // !IREE_MEMORY_ACCESS_ALIGNMENT_REQUIRED
+
+//==============================================================================
 // IREE_COMPILER_*
 //==============================================================================
 
diff --git a/iree/vm/bytecode_dispatch_util.h b/iree/vm/bytecode_dispatch_util.h
index 2e692be..596968e 100644
--- a/iree/vm/bytecode_dispatch_util.h
+++ b/iree/vm/bytecode_dispatch_util.h
@@ -153,34 +153,14 @@
 
 // Bytecode data access macros for reading values of a given type from a byte
 // offset within the current function.
-#if defined(IREE_ENDIANNESS_LITTLE)
-#define OP_I8(i) bytecode_data[pc + (i)]
-#define OP_I16(i) *((uint16_t*)&bytecode_data[pc + (i)])
-#define OP_I32(i) *((uint32_t*)&bytecode_data[pc + (i)])
-#define OP_I64(i) *((uint64_t*)&bytecode_data[pc + (i)])
-#define OP_F32(i) *((float*)&bytecode_data[pc + (i)])
-#define OP_F64(i) *((double*)&bytecode_data[pc + (i)])
-#else
-#define OP_I8(i) bytecode_data[pc + (i)]
-#define OP_I16(i)                           \
-  ((uint16_t)bytecode_data[pc + 0 + (i)]) | \
-      ((uint16_t)bytecode_data[pc + 1 + (i)] << 8)
-#define OP_I32(i)                                     \
-  ((uint32_t)bytecode_data[pc + 0 + (i)]) |           \
-      ((uint32_t)bytecode_data[pc + 1 + (i)] << 8) |  \
-      ((uint32_t)bytecode_data[pc + 2 + (i)] << 16) | \
-      ((uint32_t)bytecode_data[pc + 3 + (i)] << 24)
-#define OP_I64(i)                                     \
-  ((uint64_t)bytecode_data[pc + 0 + (i)]) |           \
-      ((uint64_t)bytecode_data[pc + 1 + (i)] << 8) |  \
-      ((uint64_t)bytecode_data[pc + 2 + (i)] << 16) | \
-      ((uint64_t)bytecode_data[pc + 3 + (i)] << 24) | \
-      ((uint64_t)bytecode_data[pc + 4 + (i)] << 32) | \
-      ((uint64_t)bytecode_data[pc + 5 + (i)] << 40) | \
-      ((uint64_t)bytecode_data[pc + 6 + (i)] << 48) | \
-      ((uint64_t)bytecode_data[pc + 7 + (i)] << 56)
-#error "TODO: OP_F32 and OP_F64 for big endian systems"
-#endif  // IREE_ENDIANNESS_LITTLE
+// Values are encoded little-endian in the bytecode stream; the load helper
+// also handles byte offsets on architectures that require aligned accesses.
+#define OP_I8(i) iree_unaligned_load_le((uint8_t*)&bytecode_data[pc + (i)])
+#define OP_I16(i) iree_unaligned_load_le((uint16_t*)&bytecode_data[pc + (i)])
+#define OP_I32(i) iree_unaligned_load_le((uint32_t*)&bytecode_data[pc + (i)])
+#define OP_I64(i) iree_unaligned_load_le((uint64_t*)&bytecode_data[pc + (i)])
+#define OP_F32(i) iree_unaligned_load_le((float*)&bytecode_data[pc + (i)])
+#define OP_F64(i) iree_unaligned_load_le((double*)&bytecode_data[pc + (i)])
 
 //===----------------------------------------------------------------------===//
 // Utilities matching the tablegen op encoding scheme