Fixes and simplifications to CPU-data handling (#13881)

In DispatchABI.cpp:
* Was not checking if CPU feature bits in the schema applied to the
target architecture.
* Was not checking if CPU feature bits belonged to field 0 even though
the current code only handles that. Other fields are currently unused
but at least now we will get an assert failure if that changes.
* Was building a vector of bits, then creating one `LLVM::OrOp`for each
bit. Instead, why not OR those bits in C++ and create a single
`LLVM::OrOp` with the resulting OR-ed value?
* Was not checking that the first character of each feature string was a
`'+'` before dropping it.
* Was room for simplification in how the schema was used.

Additionally, folding into this PR a simplification to `iree-cpuinfo` (a
standalone tool to report the host CPU's features exactly as detected by
IREE). It was quadratic-time, as it unnecessarily called
`iree_cpu_lookup_data_by_key` for each feature.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
index 25c2ffb..39709b2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/LLVMCPU/DispatchABI.h"
 
+#include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/schemas/cpu_data.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Support/CommandLine.h"
@@ -24,39 +25,6 @@
 namespace mlir {
 namespace iree_compiler {
 
-// List of all defined llvm feature-name to bit pattern used to represent it.
-// This is derived based on the schema in `runtime/src/iree/schemas/`.
-// TODO(ravishankarm): This link to the runtime schemas needs to be broken.
-// Instead we should use a reflection callback to resolve arch guarded features
-// directly in the compiler.
-
-// Struct to capture tuple of llvm feature-name to bit pattern used to represent
-// it.
-struct iree_llvm_name_and_bit_pattern_t {
-  const char *llvm_name;
-  unsigned long long bit_pattern;
-};
-
-#define IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name) \
-  IREE_CPU_DATA##field_index##_##arch##_##bit_name
-
-#define IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN(arch, field_index, bit_name, \
-                                              llvm_name)                   \
-  {llvm_name, IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name)},
-
-static const struct iree_llvm_name_and_bit_pattern_t
-    iree_llvm_name_and_bit_pattern_list[] = {
-
-#define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name) \
-  IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN(arch, field_index, bit_name, llvm_name)
-#include "iree/schemas/cpu_feature_bits.inl"
-#undef IREE_CPU_FEATURE_BIT
-
-};
-
-#undef IREE_CPU_FEATURE_NAME_AND_BIT_PATTERN
-#undef IREE_CPU_FEATURE_BIT_NAME
-
 //------------------------------------------------------------------------------
 // ExecutableLibraryDI
 //------------------------------------------------------------------------------
@@ -885,24 +853,47 @@
     return processorDataPtrValue;
   }
 
-  SmallVector<uint64_t> specifiedFeatureBitPatterns;
+  // Currently requiring all CPU feature bits to be in field 0. Generalize as
+  // needed when other CPU feature fields start to be used.
+  uint64_t specifiedCpuDataField0 = 0;
   {
+    // Map llvm feature-name to bit used to represent it in IREE_CPUDATA_FIELD0.
+    //
+    // TODO(ravishankarm): This link to the runtime schemas needs to be broken.
+    // Instead we should use a reflection callback to resolve arch guarded
+    // features directly in the compiler.
     llvm::StringMap<uint64_t> featureToBitPattern;
-    for (auto [llvmName, bitPattern] : iree_llvm_name_and_bit_pattern_list) {
-      featureToBitPattern[llvmName] = bitPattern;
+    auto targetTriple = getTargetTriple(targetAttr);
+    if (!targetTriple) {
+      return processorDataPtrValue;
     }
+    std::string targetArchUppercase =
+        StringRef(getIreeArchNameForTargetTriple(targetTriple.value())).upper();
+#define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name) \
+  if (targetArchUppercase == #arch) {                                         \
+    assert(field_index == 0);                                                 \
+    featureToBitPattern[llvm_name] = 1ull << bit_pos;                         \
+  }
+#include "iree/schemas/cpu_feature_bits.inl"
+#undef IREE_CPU_FEATURE_BIT
+
+    // Find CPU features in featureToBitPattern
     SmallVector<StringRef> cpuFeatureStrings;
     llvm::cast<StringAttr>(cpuFeatures->getValue())
         .getValue()
         .split(cpuFeatureStrings, ',', /*MakeSplit=*/-1, /*KeepEmpty=*/false);
     for (auto featureString : cpuFeatureStrings) {
-      if (featureToBitPattern.count(featureString.drop_front())) {
-        specifiedFeatureBitPatterns.push_back(
-            featureToBitPattern.lookup(featureString.drop_front()));
+      // CPU features are typically prefixed with a +, e.g. +avx,+avx2,+fma.
+      featureString.consume_front("+");
+      // Silently skip unknown CPU features, more flexible for now. Note that
+      // some featurs occurring here are not standard CPU features but internal
+      // things such as the "+reserve-x18" that we add on arm64.
+      if (featureToBitPattern.count(featureString)) {
+        specifiedCpuDataField0 |= featureToBitPattern.lookup(featureString);
       }
     }
   }
-  if (specifiedFeatureBitPatterns.empty()) {
+  if (specifiedCpuDataField0 == 0) {
     return processorDataPtrValue;
   }
 
@@ -919,11 +910,9 @@
   Value srcData0 =
       builder.create<LLVM::LoadOp>(loc, i64Ty, processorDataPtrValue);
   // Set the specified CPU arch data.
-  for (auto bitPattern : specifiedFeatureBitPatterns) {
-    Value bitPatternVal = builder.create<LLVM::ConstantOp>(
-        loc, i64Ty, builder.getI64IntegerAttr(bitPattern));
-    srcData0 = builder.create<LLVM::OrOp>(loc, srcData0, bitPatternVal);
-  }
+  Value bitPatternVal = builder.create<LLVM::ConstantOp>(
+      loc, i64Ty, builder.getI64IntegerAttr(specifiedCpuDataField0));
+  srcData0 = builder.create<LLVM::OrOp>(loc, srcData0, bitPatternVal);
   builder.create<LLVM::StoreOp>(loc, srcData0, alloca);
   // Copy over the rest.
   for (int64_t i = 1, e = ProcessorDataCapacity; i < e; ++i) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
index 7acae65..3a3cb25 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
@@ -102,13 +102,7 @@
 }
 //      CHECK: llvm.func @paramstruct_cconv_with_extra_fields_and_executable_target(!llvm.ptr)
 //      CHECK: llvm.func @bar
-//  CHECK-DAG:   %[[F16C:.+]] = llvm.mlir.constant(16384 : i64) : i64
-//  CHECK-DAG:   %[[FMA:.+]] = llvm.mlir.constant(2048 : i64) : i64
-//  CHECK-DAG:   %[[AVX2:.+]] = llvm.mlir.constant(32768 : i64) : i64
-//  CHECK-DAG:   %[[AVX:.+]] = llvm.mlir.constant(1024 : i64) : i64
-//  CHECK-DAG:   %[[SSE41:.+]] = llvm.mlir.constant(4 : i64) : i64
-//  CHECK-DAG:   %[[SSSE3:.+]] = llvm.mlir.constant(2 : i64) : i64
-//  CHECK-DAG:   %[[SSE3:.+]] = llvm.mlir.constant(1 : i64) : i64
+//  CHECK-DAG:   %[[CPUDATA_FIELD0:.+]] = llvm.mlir.constant(52239 : i64) : i64
 //  CHECK-DAG:   %[[C8:.+]] = llvm.mlir.constant(8 : i64) : i64
 //  CHECK-DAG:   %[[C1:.+]] = llvm.mlir.constant(1 : index) : i64
 //  CHECK-DAG:   %[[Ci32:.+]] = llvm.mlir.constant(42 : i32) : i32
@@ -117,15 +111,8 @@
 //  CHECK-DAG:   %[[DATA_PTR:.+]] = llvm.getelementptr inbounds %arg0[4]
 //      CHECK:   %[[PROCESSOR_DATA_ALLOCA:.+]] = llvm.alloca %[[C8]] x i64 {alignment = 8 : i64}
 //  CHECK-DAG:   %[[DATA:.+]] = llvm.load %[[DATA_PTR]]
-//  CHECK-DAG:   %[[OR0:.+]] = llvm.or %[[DATA]], %[[SSE3]]
-//  CHECK-DAG:   %[[OR1:.+]] = llvm.or %[[OR0]], %[[SSSE3]]
-//  CHECK-DAG:   %[[OR2:.+]] = llvm.or %[[OR1]], %[[SSE41]]
-//  CHECK-DAG:   %[[OR3:.+]] = llvm.or %[[OR2]], %[[C8]]
-//  CHECK-DAG:   %[[OR4:.+]] = llvm.or %[[OR3]], %[[AVX]]
-//  CHECK-DAG:   %[[OR5:.+]] = llvm.or %[[OR4]], %[[AVX2]]
-//  CHECK-DAG:   %[[OR6:.+]] = llvm.or %[[OR5]], %[[FMA]]
-//  CHECK-DAG:   %[[OR7:.+]] = llvm.or %[[OR6]], %[[F16C]]
-//      CHECK:   llvm.store %[[OR7]], %[[PROCESSOR_DATA_ALLOCA]]
+//  CHECK-DAG:   %[[OR0:.+]] = llvm.or %[[DATA]], %[[CPUDATA_FIELD0]]
+//      CHECK:   llvm.store %[[OR0]], %[[PROCESSOR_DATA_ALLOCA]]
 
 //      CHECK:   %[[PROCESSOR_DATA_PTR_1:.+]] = llvm.getelementptr inbounds %[[DATA_PTR]][1]
 //      CHECK:   %[[PROCESSOR_DATA_1:.+]] = llvm.load %[[PROCESSOR_DATA_PTR_1]]
diff --git a/tools/iree-cpuinfo.c b/tools/iree-cpuinfo.c
index d740465..f5324d0 100644
--- a/tools/iree-cpuinfo.c
+++ b/tools/iree-cpuinfo.c
@@ -11,11 +11,11 @@
 
 int main(int argc, char *argv[]) {
   iree_cpu_initialize(iree_allocator_system());
+  const uint64_t* cpu_data = iree_cpu_data_fields();
 
 #define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name) \
   if (IREE_ARCH_ENUM == IREE_ARCH_ENUM_##arch) {                              \
-    int64_t result = 0;                                                       \
-    IREE_CHECK_OK(iree_cpu_lookup_data_by_key(IREE_SV(llvm_name), &result));  \
+    bool result = (cpu_data[field_index] & (1ull << bit_pos)) != 0;           \
     printf("%-20s %ld\n", llvm_name, result);                                 \
   }
 #include "iree/schemas/cpu_feature_bits.inl"