)]}'
{
  "commit": "3459230b8838dfd4f425f12984c21ad373ef08b4",
  "tree": "bcb302d049965667bdf08e6411c10c0372da3259",
  "parents": [
    "749453c2c3d4fa28f4f1bfed0ba00890f6e87b7a"
  ],
  "author": {
    "name": "bjacob",
    "email": "benoitjacob@google.com",
    "time": "Wed Jul 26 17:11:53 2023 -0400"
  },
  "committer": {
    "name": "GitHub",
    "email": "noreply@github.com",
    "time": "Wed Jul 26 17:11:53 2023 -0400"
  },
  "message": "Ukernels: mmt4d paths for the arm64 bf16 extension (#14495)\n\nThis extension is present on recent cores (Arm Cortex-A510/A710/X2) and\r\nhelps `bf16` matmuls accumulating into `f32` be up to 4x faster than\r\n`f32` matmuls thanks to not only the 2x smaller bit width, but also\r\nthanks to these new instructions being matrix multiplications. There are\r\ntwo instructions, `bfmmla` and `bfdot`. The `bfmmla` instruction is\r\ngenerally faster, and the Arm Reference Manual says it should always be\r\nfaster, but on the A510 it\u0027s actually slower. As we are not currently\r\ninto the business of writing code paths tuned for microarchitectures\r\n(beyond ISA capabilities), this PR just adds a `bfmmla` kernel, which\r\nperforms great on A710 and X2 and is still a speedup over f32 on the\r\nA510 even though it isn\u0027t optimal there.\r\n\r\nResults from `mmt4d_benchmark` (GFlop/s, single-thread):\r\n\r\nLHS/RHS/accumulator types | Arm Cortex-A510 | Arm Cortex-A710 | Arm\r\nCortex-X2\r\n--- | --- | --- | ---\r\n`f32f32f32` | 12.1 | 38.9 | 91.5\r\n`bf16bf16f32` | 13.3 | 152.4 | 314.4",
  "tree_diff": [
    {
      "type": "modify",
      "old_id": "3f0bffa9569758393139e64806dfc4c97c5126fc",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD.bazel",
      "new_id": "45c18741a3a7faf76a2b0fa66ce0e076c9ebb771",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/BUILD.bazel"
    },
    {
      "type": "modify",
      "old_id": "c245d6b86371e1101df17237461bcf8b48da1247",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt",
      "new_id": "49cee5bee146c04fb5d0ac2a3f66db497eec3b70",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/CMakeLists.txt"
    },
    {
      "type": "modify",
      "old_id": "d3f4ad05e98d849c3f90bdd560599abd06200cb8",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/common_arm_64_entry_point.h",
      "new_id": "498c882b15fe3156bc9472e5d97d30cec1d0a18b",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/common_arm_64_entry_point.h"
    },
    {
      "type": "modify",
      "old_id": "d86c7763322d92c10f0bf83847cb846154b40dff",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/config_arm_64.h.in",
      "new_id": "5f9a75d2101a46fc3afa3285f2c983166221184d",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/config_arm_64.h.in"
    },
    {
      "type": "add",
      "old_id": "0000000000000000000000000000000000000000",
      "old_mode": 0,
      "old_path": "/dev/null",
      "new_id": "87248ba52910e7016cc2d206465d0d23d1a9b4fa",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64_bf16.c"
    },
    {
      "type": "modify",
      "old_id": "599313d47f19da1ac52caa11f12f6e83fdcbe7ab",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64_entry_point.c",
      "new_id": "326cd2184c72b275cbc3aa781e85ca4a6f9f9a72",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64_entry_point.c"
    },
    {
      "type": "modify",
      "old_id": "91066f49057c9f32942109872c54f64c9bd947d5",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64_internal.h",
      "new_id": "8dac035c1483696dc22eed3628328a9cbb0a8b7f",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/arch/arm_64/mmt4d_arm_64_internal.h"
    },
    {
      "type": "modify",
      "old_id": "04e63060b46448398c013c6ffff13d0bb44a5300",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c",
      "new_id": "6a07957ed952533195e4be85c5b129b2b735a972",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/tools/mmt4d_benchmark.c"
    },
    {
      "type": "modify",
      "old_id": "30728124ca43911afc3e07705143ba27d873ef3c",
      "old_mode": 33188,
      "old_path": "runtime/src/iree/builtins/ukernel/tools/mmt4d_test.c",
      "new_id": "6c4c11b8cc311579770602c32db61a284ea9e98d",
      "new_mode": 33188,
      "new_path": "runtime/src/iree/builtins/ukernel/tools/mmt4d_test.c"
    }
  ]
}
