)]}'
{
  "commit": "f4d1908e89fece7e0bf5cebd606b69a9483817f3",
  "tree": "872e5fcb34d591273dcf19ccd3d6e2b24723b42e",
  "parents": [
    "7098bdfec0fa62ce250e9a99b0977dbb86b4fcf0"
  ],
  "author": {
    "name": "Zhewen Yu",
    "email": "zhewenyu@amd.com",
    "time": "Fri May 01 22:48:38 2026 +0100"
  },
  "committer": {
    "name": "GitHub",
    "email": "noreply@github.com",
    "time": "Fri May 01 22:48:38 2026 +0100"
  },
  "message": "[Codegen][DMA] Fix unaligned swizzle offset computation in gather-to-lds lowering (#24241)\n\nThe inverse XOR swizzle applied to DMA source offsets was incorrect in\ntwo cases:\n\n1. **Subgroup base offset**: When a subgroup\u0027s transfer size is not a\nmultiple of the swizzle period, different subgroups sharing the same\nlocal offsets but occupying different rows would get identical swizzled\naddresses.\n\nFix: incorporate the subgroup\u0027s base offset within the full allocation\nbefore swizzling.\n\n2. **Access-width alignment**: When `elementsPerLane \u003c accessWidth`, the\ninteger division inside `swizzleOffset` truncates offsets that differ\nonly within an access-width group to the same value.\n\nFix: strip the sub-accesswidth remainder before swizzling and restore it\nafter. This fix is applied directly in `swizzleOffset` for both XOR and\nrotate_rows swizzles. While rotate_rows isn\u0027t currently used with DMA,\nthe access-width alignment issue affects both swizzle types.\n\nBoth issues caused numerical mismatches for BF16 batch matmuls using DMA\nwith XOR swizzle enabled.\n\nAssisted-by: Cursor (Claude)\n\n---------\n\nSigned-off-by: Yu-Zhewen \u003czhewenyu@amd.com\u003e",
  "tree_diff": [
    {
      "type": "modify",
      "old_id": "64fe2f4afb4c1be04b08ddb97289663c6b6efec9",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPULowerCoalescedDMAToGatherLDS.cpp",
      "new_id": "c46fafd2ebe59f8e02486ea3dc6cec0fa6f00a04",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Common/GPU/AMDGPULowerCoalescedDMAToGatherLDS.cpp"
    },
    {
      "type": "modify",
      "old_id": "4b77f00e3d0ed360d85ff2f8b4806757c574faf6",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Common/GPU/test/amdgpu_lower_coalesced_dma_to_gather_lds.mlir",
      "new_id": "0a032bacf6eda4709f6903268640333ddc3c741d",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Common/GPU/test/amdgpu_lower_coalesced_dma_to_gather_lds.mlir"
    },
    {
      "type": "modify",
      "old_id": "ee7a9d7e132869bd77b765e8b871e606029a36a9",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Common/test/resolve_swizzle_hints.mlir",
      "new_id": "0df2999df534da5de4aeb2c3ea2dbdb8fec479b0",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Common/test/resolve_swizzle_hints.mlir"
    },
    {
      "type": "modify",
      "old_id": "46570c915c2b69190e146c09d4c632db242102b7",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp",
      "new_id": "9b0bade3432768848551addb20b016101fd6a030",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp"
    },
    {
      "type": "modify",
      "old_id": "59176ea506431ed414bb6b8b2be67619aa9b84d2",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td",
      "new_id": "f692a93e5fadc9a993b6e34415d7bec180a7279e",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td"
    },
    {
      "type": "modify",
      "old_id": "19603bc5e96f426a95ee78a1155b4372ff15fe71",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp",
      "new_id": "e1adfa543badb4e8ed31f6cbedf76c3892191c7d",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp"
    },
    {
      "type": "modify",
      "old_id": "a39e09558606d93771c8de87c24d751a289e49e6",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h",
      "new_id": "631e0207670d1cdfa9371c4deccbcfa42f544431",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h"
    },
    {
      "type": "modify",
      "old_id": "8b4e15c562ce0634eebb24c47de29c54a5ddeb85",
      "old_mode": 33188,
      "old_path": "tests/e2e/matmul/CMakeLists.txt",
      "new_id": "e89d5337d63ab322017c1e70b04267ebdfd2ccd0",
      "new_mode": 33188,
      "new_path": "tests/e2e/matmul/CMakeLists.txt"
    },
    {
      "type": "modify",
      "old_id": "08a3d88ee932b587464571505ab411fd721fa11b",
      "old_mode": 33188,
      "old_path": "tests/e2e/matmul/generate_e2e_batch_matmul_tests.py",
      "new_id": "80d9eb0fa80ee4abc480a1dcd2ba83c664a67a22",
      "new_mode": 33188,
      "new_path": "tests/e2e/matmul/generate_e2e_batch_matmul_tests.py"
    }
  ]
}
