Implement CPU feature detection on x86 (CPUID) (#12440)
diff --git a/runtime/src/iree/base/internal/cpu.c b/runtime/src/iree/base/internal/cpu.c index ff5af87..3bc25f1 100644 --- a/runtime/src/iree/base/internal/cpu.c +++ b/runtime/src/iree/base/internal/cpu.c
@@ -16,7 +16,7 @@ // Platform-specific processor data queries //===----------------------------------------------------------------------===// -#define iree_copy_bits(dst_val, dst_mask, src_val, src_mask) \ +#define IREE_COPY_BITS(dst_val, dst_mask, src_val, src_mask) \ ((dst_val) |= (iree_all_bits_set((src_val), (src_mask)) ? (dst_mask) : 0)) #if defined(IREE_ARCH_ARM_64) @@ -40,9 +40,9 @@ uint32_t hwcap = getauxval(AT_HWCAP); uint32_t hwcap2 = getauxval(AT_HWCAP2); uint64_t out0 = 0; - iree_copy_bits(out0, IREE_CPU_DATA0_ARM_64_DOTPROD, hwcap, + IREE_COPY_BITS(out0, IREE_CPU_DATA0_ARM_64_DOTPROD, hwcap, IREE_HWCAP_ASIMDDP); - iree_copy_bits(out0, IREE_CPU_DATA0_ARM_64_I8MM, hwcap2, IREE_HWCAP2_I8MM); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_ARM_64_I8MM, hwcap2, IREE_HWCAP2_I8MM); out_fields[0] = out0; } @@ -75,12 +75,146 @@ #endif // IREE_PLATFORM_* +#elif defined(IREE_ARCH_X86_64) + +#if defined(__GNUC__) +#include <cpuid.h> +#elif defined(_MSC_VER) +#include <intrin.h> +#endif + +typedef struct iree_cpuid_regs_t { + uint32_t eax; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; +} iree_cpuid_regs_t; + +static inline iree_cpuid_regs_t iree_cpuid_raw(uint32_t eax, uint32_t ecx) { + iree_cpuid_regs_t regs; +#if defined(__GNUC__) + __cpuid_count(eax, ecx, regs.eax, regs.ebx, regs.ecx, regs.edx); +#elif defined(_MSC_VER) + int regs_array[4]; + __cpuidex(regs_array, (int)eax, (int)ecx); + regs.eax = regs_array[0]; + regs.ebx = regs_array[1]; + regs.ecx = regs_array[2]; + regs.edx = regs_array[3]; +#else +#error What's the __cpuidex built-in for this compiler? +#endif + return regs; +} + +typedef struct iree_cpuid_bounds_t { + uint32_t max_base_eax; + uint32_t max_extended_eax; +} iree_cpuid_bounds_t; + +static inline iree_cpuid_bounds_t iree_cpuid_query_bounds() { + iree_cpuid_bounds_t bounds; + bounds.max_base_eax = iree_cpuid_raw(0, 0).eax; + bounds.max_extended_eax = iree_cpuid_raw(0x80000000u, 0).eax; + if (bounds.max_extended_eax < 0x80000000u) bounds.max_extended_eax = 0; + return bounds; +} + +static inline bool iree_cpuid_is_in_range(uint32_t eax, uint32_t ecx, + iree_cpuid_bounds_t bounds) { + if (eax < 0x80000000u) { + // EAX is a base function id. + if (eax > bounds.max_base_eax) return false; + } else { + // EAX is an extended function id. + if (eax > bounds.max_extended_eax) return false; + } + if (ecx) { + // ECX is a nonzero sub-function id. + uint32_t max_ecx = iree_cpuid_raw(eax, 0).eax; + if (ecx > max_ecx) return false; + } + return true; +} + +static inline iree_cpuid_regs_t iree_cpuid_or_zero(uint32_t eax, uint32_t ecx, + iree_cpuid_bounds_t bounds) { + if (!iree_cpuid_is_in_range(eax, ecx, bounds)) { + return (iree_cpuid_regs_t){0, 0, 0, 0}; + } + return iree_cpuid_raw(eax, ecx); +} + +static void iree_cpu_initialize_from_platform_x86_64(uint64_t* out_fields) { + iree_cpuid_bounds_t bounds = iree_cpuid_query_bounds(); + iree_cpuid_regs_t leaf1 = iree_cpuid_or_zero(1, 0, bounds); + iree_cpuid_regs_t leaf7_0 = iree_cpuid_or_zero(7, 0, bounds); + iree_cpuid_regs_t leaf7_1 = iree_cpuid_or_zero(7, 1, bounds); + iree_cpuid_regs_t leafD = iree_cpuid_or_zero(0xD, 0, bounds); + iree_cpuid_regs_t leafExt1 = iree_cpuid_or_zero(0x80000001u, 0, bounds); + + // Bits are given by bit position not by hex value because this is how they + // are described in the Intel Architectures Software Developer's Manual, + // Table 3-8, "Information Returned by CPUID Instruction". + + uint64_t out0 = 0; + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE3, leaf1.ecx, 1 << 0); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSSE3, leaf1.ecx, 1 << 9); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE41, leaf1.ecx, 1 << 19); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE42, leaf1.ecx, 1 << 20); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_SSE4A, leafExt1.ecx, 1 << 6); + + // Features that depend on YMM registers being enabled by the OS. + if (iree_all_bits_set(leafD.eax, 0x7)) { + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX, leaf1.ecx, 1 << 28); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_FMA3, leaf1.ecx, 1 << 12); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_FMA4, leafExt1.ecx, 1 << 16); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_XOP, leafExt1.ecx, 1 << 11); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_F16C, leaf1.ecx, 1 << 29); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX2, leaf7_0.ebx, 1 << 5); + } + + // Features that depend on ZMM registers being enabled by the OS. + if (iree_all_bits_set(leafD.eax, 0xE7)) { + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512F, leaf7_0.ebx, 1 << 16); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512CD, leaf7_0.ebx, 1 << 28); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VL, leaf7_0.ebx, 1u << 31); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512DQ, leaf7_0.ebx, 1 << 17); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512BW, leaf7_0.ebx, 1 << 30); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512IFMA, leaf7_0.ebx, + 1 << 21); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VBMI, leaf7_0.ecx, 1 << 1); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VPOPCNTDQ, leaf7_0.ecx, + 1 << 14); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VNNI, leaf7_0.ecx, + 1 << 11); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512VBMI2, leaf7_0.ecx, + 1 << 6); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512BITALG, leaf7_0.ecx, + 1 << 12); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512BF16, leaf7_1.eax, 1 << 5); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AVX512FP16, leaf7_0.edx, + 1 << 23); + } + + // Features that depend on AMX TILE state being enabled by the OS. + if (iree_all_bits_set(leafD.eax, 0x60000)) { + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AMXTILE, leaf7_0.edx, 1 << 24); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AMXINT8, leaf7_0.edx, 1 << 25); + IREE_COPY_BITS(out0, IREE_CPU_DATA0_X86_64_AMXBF16, leaf7_0.edx, 1 << 22); + } + + out_fields[0] = out0; +} + #endif // defined(IREE_ARCH_ARM_64) static void iree_cpu_initialize_from_platform(iree_allocator_t temp_allocator, uint64_t* out_fields) { #if defined(IREE_ARCH_ARM_64) iree_cpu_initialize_from_platform_arm_64(out_fields); +#elif defined(IREE_ARCH_X86_64) + iree_cpu_initialize_from_platform_x86_64(out_fields); #else // No implementation available. CPU data will be all zeros. #endif // defined(IREE_ARCH_ARM_64)
diff --git a/runtime/src/iree/schemas/cpu_data.h b/runtime/src/iree/schemas/cpu_data.h index 23d84a7..50a4ab3 100644 --- a/runtime/src/iree/schemas/cpu_data.h +++ b/runtime/src/iree/schemas/cpu_data.h
@@ -74,6 +74,45 @@ IREE_CPU_DATA0_ARM_64_DOTPROD = 1ull << 0, IREE_CPU_DATA0_ARM_64_I8MM = 1ull << 1, + //===--------------------------------------------------------------------===// + // IREE_ARCH_X86_64 / x86-64 + //===--------------------------------------------------------------------===// + + // SSE features. Note: SSE and SSE2 are mandatory parts of X86-64. + IREE_CPU_DATA0_X86_64_SSE3 = 1ull << 0, + IREE_CPU_DATA0_X86_64_SSSE3 = 1ull << 1, + IREE_CPU_DATA0_X86_64_SSE41 = 1ull << 2, + IREE_CPU_DATA0_X86_64_SSE42 = 1ull << 3, + IREE_CPU_DATA0_X86_64_SSE4A = 1ull << 4, + + // AVX features. + IREE_CPU_DATA0_X86_64_AVX = 1ull << 10, + IREE_CPU_DATA0_X86_64_FMA3 = 1ull << 11, + IREE_CPU_DATA0_X86_64_FMA4 = 1ull << 12, + IREE_CPU_DATA0_X86_64_XOP = 1ull << 13, + IREE_CPU_DATA0_X86_64_F16C = 1ull << 14, + IREE_CPU_DATA0_X86_64_AVX2 = 1ull << 15, + + // AVX-512 features. + IREE_CPU_DATA0_X86_64_AVX512F = 1ull << 20, + IREE_CPU_DATA0_X86_64_AVX512CD = 1ull << 21, + IREE_CPU_DATA0_X86_64_AVX512VL = 1ull << 22, + IREE_CPU_DATA0_X86_64_AVX512DQ = 1ull << 23, + IREE_CPU_DATA0_X86_64_AVX512BW = 1ull << 24, + IREE_CPU_DATA0_X86_64_AVX512IFMA = 1ull << 25, + IREE_CPU_DATA0_X86_64_AVX512VBMI = 1ull << 26, + IREE_CPU_DATA0_X86_64_AVX512VPOPCNTDQ = 1ull << 27, + IREE_CPU_DATA0_X86_64_AVX512VNNI = 1ull << 28, + IREE_CPU_DATA0_X86_64_AVX512VBMI2 = 1ull << 29, + IREE_CPU_DATA0_X86_64_AVX512BITALG = 1ull << 30, + IREE_CPU_DATA0_X86_64_AVX512BF16 = 1ull << 31, + IREE_CPU_DATA0_X86_64_AVX512FP16 = 1ull << 32, + + // AMX features. + IREE_CPU_DATA0_X86_64_AMXTILE = 1ull << 50, + IREE_CPU_DATA0_X86_64_AMXINT8 = 1ull << 51, + IREE_CPU_DATA0_X86_64_AMXBF16 = 1ull << 52, + }; #endif // IREE_SCHEMAS_CPU_DATA_H_