Implement CPU feature detection on x86 (CPUID) (#12440)

diff --git a/runtime/src/iree/base/internal/cpu.c b/runtime/src/iree/base/internal/cpu.c
index ff5af87..3bc25f1 100644
--- a/runtime/src/iree/base/internal/cpu.c
+++ b/runtime/src/iree/base/internal/cpu.c
@@ -16,7 +16,7 @@
 // Platform-specific processor data queries
 //===----------------------------------------------------------------------===//
 
-#define iree_copy_bits(dst_val, dst_mask, src_val, src_mask) \
+#define IREE_COPY_BITS(dst_val, dst_mask, src_val, src_mask) \
   ((dst_val) |= (iree_all_bits_set((src_val), (src_mask)) ? (dst_mask) : 0))
 
 #if defined(IREE_ARCH_ARM_64)
@@ -40,9 +40,9 @@
   uint32_t hwcap = getauxval(AT_HWCAP);
   uint32_t hwcap2 = getauxval(AT_HWCAP2);
   uint64_t out0 = 0;
-  iree_copy_bits(out0, IREE_CPU_DATA0_ARM_64_DOTPROD, hwcap,
+  IREE_COPY_BITS(out0, IREE_CPU_DATA0_ARM_64_DOTPROD, hwcap,
                  IREE_HWCAP_ASIMDDP);
-  iree_copy_bits(out0, IREE_CPU_DATA0_ARM_64_I8MM, hwcap2, IREE_HWCAP2_I8MM);
+  IREE_COPY_BITS(out0, IREE_CPU_DATA0_ARM_64_I8MM, hwcap2, IREE_HWCAP2_I8MM);
   out_fields[0] = out0;
 }
 
@@ -75,12 +75,193 @@
 
 #endif  // IREE_PLATFORM_*
 
+#elif defined(IREE_ARCH_X86_64)
+
+#if defined(__GNUC__)
+#include <cpuid.h>
+#elif defined(_MSC_VER)
+#include <immintrin.h>
+#include <intrin.h>
+#endif
+
+// Output registers of one CPUID query.
+typedef struct iree_cpuid_regs_t {
+  uint32_t eax;
+  uint32_t ebx;
+  uint32_t ecx;
+  uint32_t edx;
+} iree_cpuid_regs_t;
+
+// Executes CPUID with the given |eax| (leaf) and |ecx| (sub-leaf) inputs and
+// returns the four output registers. No range check is performed here; see
+// iree_cpuid_or_zero for a checked variant.
+static inline iree_cpuid_regs_t iree_cpuid_raw(uint32_t eax, uint32_t ecx) {
+  iree_cpuid_regs_t regs;
+#if defined(__GNUC__)
+  __cpuid_count(eax, ecx, regs.eax, regs.ebx, regs.ecx, regs.edx);
+#elif defined(_MSC_VER)
+  int regs_array[4];
+  __cpuidex(regs_array, (int)eax, (int)ecx);
+  regs.eax = (uint32_t)regs_array[0];
+  regs.ebx = (uint32_t)regs_array[1];
+  regs.ecx = (uint32_t)regs_array[2];
+  regs.edx = (uint32_t)regs_array[3];
+#else
+#error "No known CPUID built-in for this compiler."
+#endif
+  return regs;
+}
+
+// Reads XCR0, the register in which the OS enables extended register states
+// (bit 1: SSE, bit 2: AVX, bits 5-7: AVX-512, bits 17-18: AMX). XGETBV faults
+// unless the OS has enabled XSAVE, so callers must first check the OSXSAVE
+// flag (CPUID leaf 1, ECX bit 27).
+static inline uint64_t iree_cpuid_read_xcr0(void) {
+#if defined(__GNUC__) || defined(__clang__)
+  uint32_t lo = 0;
+  uint32_t hi = 0;
+  // Inline assembly rather than the _xgetbv intrinsic so that this file does
+  // not need to be compiled with the XSAVE target feature enabled.
+  __asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
+  return ((uint64_t)hi << 32) | lo;
+#elif defined(_MSC_VER)
+  return (uint64_t)_xgetbv(0);
+#else
+#error "No known XGETBV built-in for this compiler."
+#endif
+}
+
+// Largest leaf ids that CPUID accepts on this CPU.
+typedef struct iree_cpuid_bounds_t {
+  // EAX output of leaf 0.
+  uint32_t max_base_eax;
+  // EAX output of leaf 0x80000000, or 0 if that is not an extended leaf id.
+  uint32_t max_extended_eax;
+} iree_cpuid_bounds_t;
+
+// Queries the highest supported base and extended leaf ids.
+static inline iree_cpuid_bounds_t iree_cpuid_query_bounds(void) {
+  iree_cpuid_bounds_t bounds;
+  bounds.max_base_eax = iree_cpuid_raw(0, 0).eax;
+  bounds.max_extended_eax = iree_cpuid_raw(0x80000000u, 0).eax;
+  // A value below 0x80000000 is not an extended leaf id. Zeroing it makes
+  // every extended leaf compare as out of range.
+  if (bounds.max_extended_eax < 0x80000000u) bounds.max_extended_eax = 0;
+  return bounds;
+}
+
+// Returns true if leaf |eax|, sub-leaf |ecx| is within |bounds|. For nonzero
+// |ecx| this relies on sub-leaf 0 reporting the maximum sub-leaf in EAX. That
+// holds for leaf 7, as queried below, but is not true of every CPUID leaf.
+static inline bool iree_cpuid_is_in_range(uint32_t eax, uint32_t ecx,
+                                          iree_cpuid_bounds_t bounds) {
+  if (eax < 0x80000000u) {
+    // EAX is a base function id.
+    if (eax > bounds.max_base_eax) return false;
+  } else {
+    // EAX is an extended function id.
+    if (eax > bounds.max_extended_eax) return false;
+  }
+  if (ecx) {
+    // ECX is a nonzero sub-function id.
+    uint32_t max_ecx = iree_cpuid_raw(eax, 0).eax;
+    if (ecx > max_ecx) return false;
+  }
+  return true;
+}
+
+// Like iree_cpuid_raw, but returns all-zero registers for out-of-range
+// queries so that every feature bit of an unsupported leaf reads as absent.
+static inline iree_cpuid_regs_t iree_cpuid_or_zero(uint32_t eax, uint32_t ecx,
+                                                   iree_cpuid_bounds_t bounds) {
+  if (!iree_cpuid_is_in_range(eax, ecx, bounds)) {
+    return (iree_cpuid_regs_t){0, 0, 0, 0};
+  }
+  return iree_cpuid_raw(eax, ecx);
+}
+
+// Writes to out_fields[0] the IREE_CPU_DATA0_X86_64_* bits of the features
+// that CPUID reports and whose register state the OS has enabled in XCR0.
+static void iree_cpu_initialize_from_platform_x86_64(uint64_t* out_fields) {
+  iree_cpuid_bounds_t bounds = iree_cpuid_query_bounds();
+  iree_cpuid_regs_t leaf1 = iree_cpuid_or_zero(1, 0, bounds);
+  iree_cpuid_regs_t leaf7_0 = iree_cpuid_or_zero(7, 0, bounds);
+  iree_cpuid_regs_t leaf7_1 = iree_cpuid_or_zero(7, 1, bounds);
+  iree_cpuid_regs_t leafExt1 = iree_cpuid_or_zero(0x80000001u, 0, bounds);
+
+  // OS enablement of register state is given by XCR0 itself. CPUID leaf 0xD
+  // only reports which XCR0 bits the CPU supports, so it cannot be used for
+  // this. XCR0 is readable only when OSXSAVE (leaf 1, ECX bit 27) is set; if
+  // it is not, treat all extended state as disabled.
+  uint32_t xcr0 = 0;
+  if (iree_all_bits_set(leaf1.ecx, 1 << 27)) {
+    xcr0 = (uint32_t)iree_cpuid_read_xcr0();
+  }
+
+  // Bits are given by bit position not by hex value because this is how they
+  // are described in the Intel Architectures Software Developer's Manual,
+  // Table 3-8, "Information Returned by CPUID Instruction".
+
+  uint64_t out0 = 0;
+  IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE3, leaf1.ecx, 1 << 0);
+  IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSSE3, leaf1.ecx, 1 << 9);
+  IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE41, leaf1.ecx, 1 << 19);
+  IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE42, leaf1.ecx, 1 << 20);
+  IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE4A, leafExt1.ecx, 1 << 6);
+
+  // Features that depend on YMM registers being enabled by the OS.
+  if (iree_all_bits_set(xcr0, 0x7)) {
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX, leaf1.ecx, 1 << 28);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_FMA3, leaf1.ecx, 1 << 12);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_FMA4, leafExt1.ecx, 1 << 16);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_XOP, leafExt1.ecx, 1 << 11);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_F16C, leaf1.ecx, 1 << 29);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX2, leaf7_0.ebx, 1 << 5);
+  }
+
+  // Features that depend on ZMM registers being enabled by the OS.
+  if (iree_all_bits_set(xcr0, 0xE7)) {
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512F, leaf7_0.ebx, 1 << 16);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512CD, leaf7_0.ebx, 1 << 28);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VL, leaf7_0.ebx, 1u << 31);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512DQ, leaf7_0.ebx, 1 << 17);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512BW, leaf7_0.ebx, 1 << 30);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512IFMA, leaf7_0.ebx,
+                   1 << 21);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VBMI, leaf7_0.ecx, 1 << 1);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VPOPCNTDQ, leaf7_0.ecx,
+                   1 << 14);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VNNI, leaf7_0.ecx,
+                   1 << 11);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VBMI2, leaf7_0.ecx,
+                   1 << 6);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512BITALG, leaf7_0.ecx,
+                   1 << 12);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512BF16, leaf7_1.eax, 1 << 5);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512FP16, leaf7_0.edx,
+                   1 << 23);
+  }
+
+  // Features that depend on AMX TILE state being enabled by the OS.
+  // NOTE: an OS may also require a per-process permission request before AMX
+  // tile data can be used (e.g. Linux arch_prctl); that is not checked here.
+  if (iree_all_bits_set(xcr0, 0x60000)) {
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AMXTILE, leaf7_0.edx, 1 << 24);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AMXINT8, leaf7_0.edx, 1 << 25);
+    IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AMXBF16, leaf7_0.edx, 1 << 22);
+  }
+
+  out_fields[0] = out0;
+}
+
 #endif  // defined(IREE_ARCH_ARM_64)
 
 static void iree_cpu_initialize_from_platform(iree_allocator_t temp_allocator,
                                               uint64_t* out_fields) {
 #if defined(IREE_ARCH_ARM_64)
   iree_cpu_initialize_from_platform_arm_64(out_fields);
+#elif defined(IREE_ARCH_X86_64)
+  iree_cpu_initialize_from_platform_x86_64(out_fields);
 #else
   // No implementation available. CPU data will be all zeros.
 #endif  // defined(IREE_ARCH_ARM_64)
diff --git a/runtime/src/iree/schemas/cpu_data.h b/runtime/src/iree/schemas/cpu_data.h
index 23d84a7..50a4ab3 100644
--- a/runtime/src/iree/schemas/cpu_data.h
+++ b/runtime/src/iree/schemas/cpu_data.h
@@ -74,6 +74,45 @@
   IREE_CPU_DATA0_ARM_64_DOTPROD = 1ull << 0,
   IREE_CPU_DATA0_ARM_64_I8MM = 1ull << 1,
 
+  //===--------------------------------------------------------------------===//
+  // IREE_ARCH_X86_64 / x86-64
+  //===--------------------------------------------------------------------===//
+
+  // SSE features. Note: SSE and SSE2 are mandatory parts of X86-64.
+  IREE_CPU_DATA0_X86_64_SSE3 = 1ull << 0,
+  IREE_CPU_DATA0_X86_64_SSSE3 = 1ull << 1,
+  IREE_CPU_DATA0_X86_64_SSE41 = 1ull << 2,
+  IREE_CPU_DATA0_X86_64_SSE42 = 1ull << 3,
+  IREE_CPU_DATA0_X86_64_SSE4A = 1ull << 4,
+
+  // AVX features.
+  IREE_CPU_DATA0_X86_64_AVX = 1ull << 10,
+  IREE_CPU_DATA0_X86_64_FMA3 = 1ull << 11,
+  IREE_CPU_DATA0_X86_64_FMA4 = 1ull << 12,
+  IREE_CPU_DATA0_X86_64_XOP = 1ull << 13,
+  IREE_CPU_DATA0_X86_64_F16C = 1ull << 14,
+  IREE_CPU_DATA0_X86_64_AVX2 = 1ull << 15,
+
+  // AVX-512 features.
+  IREE_CPU_DATA0_X86_64_AVX512F = 1ull << 20,
+  IREE_CPU_DATA0_X86_64_AVX512CD = 1ull << 21,
+  IREE_CPU_DATA0_X86_64_AVX512VL = 1ull << 22,
+  IREE_CPU_DATA0_X86_64_AVX512DQ = 1ull << 23,
+  IREE_CPU_DATA0_X86_64_AVX512BW = 1ull << 24,
+  IREE_CPU_DATA0_X86_64_AVX512IFMA = 1ull << 25,
+  IREE_CPU_DATA0_X86_64_AVX512VBMI = 1ull << 26,
+  IREE_CPU_DATA0_X86_64_AVX512VPOPCNTDQ = 1ull << 27,
+  IREE_CPU_DATA0_X86_64_AVX512VNNI = 1ull << 28,
+  IREE_CPU_DATA0_X86_64_AVX512VBMI2 = 1ull << 29,
+  IREE_CPU_DATA0_X86_64_AVX512BITALG = 1ull << 30,
+  IREE_CPU_DATA0_X86_64_AVX512BF16 = 1ull << 31,
+  IREE_CPU_DATA0_X86_64_AVX512FP16 = 1ull << 32,
+
+  // AMX features.
+  IREE_CPU_DATA0_X86_64_AMXTILE = 1ull << 50,
+  IREE_CPU_DATA0_X86_64_AMXINT8 = 1ull << 51,
+  IREE_CPU_DATA0_X86_64_AMXBF16 = 1ull << 52,
+
 };
 
 #endif  // IREE_SCHEMAS_CPU_DATA_H_