)]}'
{
  "commit": "8b85022c8f432f77ffd9d25eda7cc6884a2fdce8",
  "tree": "603fbf3d4e2b1eb4fe1526a473e21d268d7bbe8b",
  "parents": [
    "f629f2135ac709876644aeb5716d04ce15d49e44"
  ],
  "author": {
    "name": "Zhewen Yu",
    "email": "zhewenyu@amd.com",
    "time": "Tue Apr 21 17:24:29 2026 +0200"
  },
  "committer": {
    "name": "GitHub",
    "email": "noreply@github.com",
    "time": "Tue Apr 21 16:24:29 2026 +0100"
  },
  "message": "[Codegen] Enable DMA by default for F16/BF16 Gemm on gfx950 (#24117)\n\nThis PR flips the default DMA from false to true. Currently only BF16\nGEMM has been properly benchmarked, but going to expand other supports\nas a follow-up.\n\nOverall GEMM benchmark: **+5.9%** geomean speedup if TN (LHS transposed,\nRHS not transposed) cases are excluded. **+4.3%** if included.\n\n#### Top 20 improved\n\n| # | Op | Shape (MxNxK) | Transpose | NoDMA (us) | DMA (us) | Speedup |\n|---|-----|--------------|--------|------------|----------|---------|\n| 1 | mm | 7680x304x512 | NT | 14.4 | 9.7 | +32.7% |\n| 2 | mm | 1280x3840x3840 | NN | 84.0 | 59.5 | +29.2% |\n| 3 | mm | 24576x304x512 | NT | 33.4 | 23.8 | +28.6% |\n| 4 | addmm | 256x3840x3840 | NT | 39.2 | 28.0 | +28.6% |\n| 5 | mm | 1280x2304x576 | NT | 11.6 | 8.4 | +27.4% |\n| 6 | mm | 1280x3840x7680 | NN | 164.4 | 119.4 | +27.3% |\n| 7 | mm | 960x256x2048 | NT | 20.4 | 15.0 | +26.5% |\n| 8 | mm | 4096x2048x8192 | NN | 215.5 | 159.9 | +25.8% |\n| 9 | mm | 4096x2048x6144 | NN | 163.1 | 121.7 | +25.4% |\n| 10 | mm | 4096x2048x4096 | NN | 112.7 | 84.4 | +25.1% |\n| 11 | addmm | 5x384x384 | NT | 4.7 | 3.5 | +24.9% |\n| 12 | mm | 3072x256x2048 | NT | 21.2 | 16.1 | +24.1% |\n| 13 | addmm | 18928x128x512 | NT | 13.7 | 10.4 | +23.9% |\n| 14 | mm | 1285x8192x2048 | NN | 76.9 | 59.4 | +22.9% |\n| 15 | mm | 4112x2048x8192 | NN | 275.1 | 213.3 | +22.5% |\n| 16 | mm | 256x3840x3840 | NN | 41.7 | 32.4 | +22.2% |\n| 17 | mm | 1280x1152x576 | NT | 8.1 | 6.3 | +22.1% |\n| 18 | mm | 4096x2048x2048 | NN | 57.0 | 44.5 | +22.0% |\n| 19 | addmm | 1280x3840x3840 | NT | 83.9 | 65.5 | +22.0% |\n| 20 | mm | 1280x3840x3840 | NT | 83.7 | 66.0 | +21.1% |\n\n\u003cdetails\u003e\n\u003csummary\u003eReproduce commands (top 20 improved)\u003c/summary\u003e\n\n```\n# 1. mm 7680x304x512 NT (+32.7%)\naten::mm \u0027[[7680, 512], [512, 304]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[512, 1], [1, 512]]\u0027 \u0027[\"\", \"\"]\u0027\n# 2. mm 1280x3840x3840 NN (+29.2%)\naten::mm \u0027[[1280, 3840], [3840, 3840]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[3840, 1], [3840, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 3. mm 24576x304x512 NT (+28.6%)\naten::mm \u0027[[24576, 512], [512, 304]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[512, 1], [1, 512]]\u0027 \u0027[\"\", \"\"]\u0027\n# 4. addmm 256x3840x3840 NT (+28.6%)\naten::addmm \u0027[[3840], [256, 3840], [3840, 3840], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [3840, 1], [1, 3840], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 5. mm 1280x2304x576 NT (+27.4%)\naten::mm \u0027[[1280, 576], [576, 2304]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[576, 1], [1, 576]]\u0027 \u0027[\"\", \"\"]\u0027\n# 6. mm 1280x3840x7680 NN (+27.3%)\naten::mm \u0027[[1280, 7680], [7680, 3840]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[7680, 1], [3840, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 7. mm 960x256x2048 NT (+26.5%)\naten::mm \u0027[[960, 2048], [2048, 256]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2048, 1], [1, 2048]]\u0027 \u0027[\"\", \"\"]\u0027\n# 8. mm 4096x2048x8192 NN (+25.8%)\naten::mm \u0027[[4096, 8192], [8192, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[8192, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 9. mm 4096x2048x6144 NN (+25.4%)\naten::mm \u0027[[4096, 6144], [6144, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[6144, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 10. mm 4096x2048x4096 NN (+25.1%)\naten::mm \u0027[[4096, 4096], [4096, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[4096, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 11. addmm 5x384x384 NT (+24.9%)\naten::addmm \u0027[[384], [5, 384], [384, 384], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [384, 1], [1, 384], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 12. mm 3072x256x2048 NT (+24.1%)\naten::mm \u0027[[3072, 2048], [2048, 256]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2048, 1], [1, 2048]]\u0027 \u0027[\"\", \"\"]\u0027\n# 13. addmm 18928x128x512 NT (+23.9%)\naten::addmm \u0027[[128], [18928, 512], [512, 128], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [512, 1], [1, 512], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 14. mm 1285x8192x2048 NN (+22.9%)\naten::mm \u0027[[1285, 2048], [2048, 8192]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2048, 1], [8192, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 15. mm 4112x2048x8192 NN (+22.5%)\naten::mm \u0027[[4112, 8192], [8192, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[8192, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 16. mm 256x3840x3840 NN (+22.2%)\naten::mm \u0027[[256, 3840], [3840, 3840]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[3840, 1], [3840, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 17. mm 1280x1152x576 NT (+22.1%)\naten::mm \u0027[[1280, 576], [576, 1152]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[576, 1], [1, 576]]\u0027 \u0027[\"\", \"\"]\u0027\n# 18. mm 4096x2048x2048 NN (+22.0%)\naten::mm \u0027[[4096, 2048], [2048, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2048, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 19. addmm 1280x3840x3840 NT (+22.0%)\naten::addmm \u0027[[3840], [1280, 3840], [3840, 3840], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [3840, 1], [1, 3840], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 20. mm 1280x3840x3840 NT (+21.1%)\naten::mm \u0027[[1280, 3840], [3840, 3840]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[3840, 1], [1, 3840]]\u0027 \u0027[\"\", \"\"]\u0027\n```\n\n\u003c/details\u003e\n\n#### Regressions\n\nThe regression cases are strongly correlated with TN (LHS transposed,\nRHS not transposed) layouts — 33 out of 83 total regressions are TN, and\nthey account for all of the severe cases (\u003e25%). This PR disables DMA\nfor TN cases, and they will be addressed in a follow-up.\n\n\u003cdetails\u003e\n\u003csummary\u003eTop 20 regressed — all layouts\u003c/summary\u003e\n\n| # | Op | Shape (MxNxK) | Transpose | NoDMA (us) | DMA (us) |\nRegression |\n|---|-----|--------------|--------|------------|----------|------------|\n| 1 | mm | 8192x2048x150000 | TN | 6039.3 | 14120.4 | -133.8% |\n| 2 | mm | 16384x4096x150000 | TN | 24019.5 | 52548.8 | -118.8% |\n| 3 | mm | 4096x16384x150000 | TN | 24946.4 | 38507.4 | -54.4% |\n| 4 | mm | 2048x8192x150000 | TN | 6307.2 | 9572.7 | -51.8% |\n| 5 | mm | 1024x128x150000 | TN | 190.4 | 262.3 | -37.8% |\n| 6 | mm | 2048x2048x1285 | TN | 27.1 | 36.2 | -33.3% |\n| 7 | mm | 128x128x2119936 | TN | 287.2 | 365.2 | -27.2% |\n| 8 | mm | 576x3840x7680 | TN | 111.6 | 140.7 | -26.1% |\n| 9 | mm | 1134x2048x150000 | TN | 3435.5 | 4313.4 | -25.6% |\n| 10 | mm | 20x3840x21760 | TN | 42.9 | 53.7 | -25.1% |\n| 11 | mm | 10x576x2304 | NN | 9.0 | 11.1 | -23.7% |\n| 12 | bmm | 32x96x21x96 | NT | 4.2 | 5.1 | -23.5% |\n| 13 | addmm | 5x4x384 | NT | 6.8 | 8.3 | -21.8% |\n| 14 | bmm | 32x21x96x96 | NT | 4.2 | 5.1 | -21.5% |\n| 15 | addmm | 150000x1134x2048 | NT | 1098.5 | 1303.2 | -18.6% |\n| 16 | mm | 576x2048x24576 | TN | 167.6 | 195.8 | -16.9% |\n| 17 | mm | 2048x512x24576 | TN | 142.9 | 167.0 | -16.8% |\n| 18 | mm | 1024x1024x16 | TN | 4.4 | 5.1 | -16.1% |\n| 19 | mm | 512x2048x24576 | TN | 136.1 | 157.9 | -16.0% |\n| 20 | mm | 5x384x4 | NN | 3.5 | 4.1 | -15.2% |\n\n\u003cdetails\u003e\n\u003csummary\u003eReproduce commands\u003c/summary\u003e\n\n```\n# 1. mm 8192x2048x150000 TN (-133.8%)\naten::mm \u0027[[8192, 150000], [150000, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 8192], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 2. mm 16384x4096x150000 TN (-118.8%)\naten::mm \u0027[[16384, 150000], [150000, 4096]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 16384], [4096, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 3. mm 4096x16384x150000 TN (-54.4%)\naten::mm \u0027[[4096, 150000], [150000, 16384]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 4096], [16384, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 4. mm 2048x8192x150000 TN (-51.8%)\naten::mm \u0027[[2048, 150000], [150000, 8192]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 2048], [8192, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 5. mm 1024x128x150000 TN (-37.8%)\naten::mm \u0027[[1024, 150000], [150000, 128]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 1024], [128, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 6. mm 2048x2048x1285 TN (-33.3%)\naten::mm \u0027[[2048, 1285], [1285, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 2048], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 7. mm 128x128x2119936 TN (-27.2%)\naten::mm \u0027[[128, 2119936], [2119936, 128]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 128], [128, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 8. mm 576x3840x7680 TN (-26.1%)\naten::mm \u0027[[576, 7680], [7680, 3840]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 576], [3840, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 9. mm 1134x2048x150000 TN (-25.6%)\naten::mm \u0027[[1134, 150000], [150000, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 1134], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 10. mm 20x3840x21760 TN (-25.1%)\naten::mm \u0027[[20, 21760], [21760, 3840]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 20], [3840, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 11. mm 10x576x2304 NN (-23.7%)\naten::mm \u0027[[10, 2304], [2304, 576]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2304, 1], [576, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 12. bmm 32x96x21x96 (-23.5%)\naten::bmm \u0027[[32, 96, 96], [32, 96, 21]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[9216, 96, 1], [2016, 1, 96]]\u0027 \u0027[\"\", \"\"]\u0027\n# 13. addmm 5x4x384 NT (-21.8%)\naten::addmm \u0027[[4], [5, 384], [384, 4], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [384, 1], [1, 384], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 14. bmm 32x21x96x96 (-21.5%)\naten::bmm \u0027[[32, 21, 96], [32, 96, 96]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2016, 96, 1], [9216, 1, 96]]\u0027 \u0027[\"\", \"\"]\u0027\n# 15. addmm 150000x1134x2048 NT (-18.6%)\naten::addmm \u0027[[1134], [150000, 2048], [2048, 1134], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [2048, 1], [1, 2048], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 16. mm 576x2048x24576 TN (-16.9%)\naten::mm \u0027[[576, 24576], [24576, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 576], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 17. mm 2048x512x24576 TN (-16.8%)\naten::mm \u0027[[2048, 24576], [24576, 512]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 2048], [512, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 18. mm 1024x1024x16 TN (-16.1%)\naten::mm \u0027[[1024, 16], [16, 1024]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 1024], [1024, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 19. mm 512x2048x24576 TN (-16.0%)\naten::mm \u0027[[512, 24576], [24576, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[1, 512], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 20. mm 5x384x4 NN (-15.2%)\naten::mm \u0027[[5, 4], [4, 384]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[4, 1], [384, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n```\n\n\u003c/details\u003e\n\n\u003c/details\u003e\n\n\u003cdetails\u003e\n\u003csummary\u003eTop 20 regressed — excluding TN layouts\u003c/summary\u003e\n\n| # | Op | Shape (MxNxK) | Transpose | NoDMA (us) | DMA (us) |\nRegression |\n|---|-----|--------------|--------|------------|----------|------------|\n| 1 | mm | 10x576x2304 | NN | 9.0 | 11.1 | -23.7% |\n| 2 | bmm | 32x96x21x96 | NT | 4.2 | 5.1 | -23.5% |\n| 3 | addmm | 5x4x384 | NT | 6.8 | 8.3 | -21.8% |\n| 4 | bmm | 32x21x96x96 | NT | 4.2 | 5.1 | -21.5% |\n| 5 | addmm | 150000x1134x2048 | NT | 1098.5 | 1303.2 | -18.6% |\n| 6 | mm | 5x384x4 | NN | 3.5 | 4.1 | -15.2% |\n| 7 | mm | 1285x2048x8192 | NN | 154.9 | 178.0 | -14.9% |\n| 8 | mm | 1285x2048x3840 | NT | 65.0 | 74.0 | -13.9% |\n| 9 | mm | 16800000x134x128 | NN | 3879.0 | 4408.3 | -13.6% |\n| 10 | mm | 1285x2048x2048 | NN | 43.2 | 48.9 | -13.1% |\n| 11 | mm | 1285x2048x3840 | NN | 76.2 | 86.1 | -12.9% |\n| 12 | mm | 24576x512x2048 | NN | 102.8 | 115.2 | -12.1% |\n| 13 | bmm | 16x384x384x192 | NT | 6.4 | 7.1 | -10.9% |\n| 14 | bmm | 32x96x96x21 | NN | 4.0 | 4.4 | -10.8% |\n| 15 | bmm | 32x96x96x96 | NT | 4.5 | 5.0 | -10.4% |\n| 16 | bmm | 32x96x96x96 | NN | 4.5 | 4.9 | -9.4% |\n| 17 | mm | 528x2304x576 | NN | 9.2 | 10.1 | -8.9% |\n| 18 | bmm | 32x96x96x21 | NN | 6.5 | 7.0 | -8.6% |\n| 19 | addmm | 1285x2048x2048 | NT | 36.9 | 39.9 | -8.0% |\n| 20 | mm | 24576x2048x512 | NN | 106.6 | 114.7 | -7.7% |\n\n\u003cdetails\u003e\n\u003csummary\u003eReproduce commands\u003c/summary\u003e\n\n```\n# 1. mm 10x576x2304 NN (-23.7%)\naten::mm \u0027[[10, 2304], [2304, 576]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2304, 1], [576, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 2. bmm 32x96x21x96 (-23.5%)\naten::bmm \u0027[[32, 96, 96], [32, 96, 21]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[9216, 96, 1], [2016, 1, 96]]\u0027 \u0027[\"\", \"\"]\u0027\n# 3. addmm 5x4x384 NT (-21.8%)\naten::addmm \u0027[[4], [5, 384], [384, 4], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [384, 1], [1, 384], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 4. bmm 32x21x96x96 (-21.5%)\naten::bmm \u0027[[32, 21, 96], [32, 96, 96]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2016, 96, 1], [9216, 1, 96]]\u0027 \u0027[\"\", \"\"]\u0027\n# 5. addmm 150000x1134x2048 NT (-18.6%)\naten::addmm \u0027[[1134], [150000, 2048], [2048, 1134], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [2048, 1], [1, 2048], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 6. mm 5x384x4 NN (-15.2%)\naten::mm \u0027[[5, 4], [4, 384]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[4, 1], [384, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 7. mm 1285x2048x8192 NN (-14.9%)\naten::mm \u0027[[1285, 8192], [8192, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[8192, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 8. mm 1285x2048x3840 NT (-13.9%)\naten::mm \u0027[[1285, 3840], [3840, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[3840, 1], [1, 3840]]\u0027 \u0027[\"\", \"\"]\u0027\n# 9. mm 16800000x134x128 NN (-13.6%)\naten::mm \u0027[[16800000, 128], [128, 134]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[128, 1], [134, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 10. mm 1285x2048x2048 NN (-13.1%)\naten::mm \u0027[[1285, 2048], [2048, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2048, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 11. mm 1285x2048x3840 NN (-12.9%)\naten::mm \u0027[[1285, 3840], [3840, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[3840, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 12. mm 24576x512x2048 NN (-12.1%)\naten::mm \u0027[[24576, 2048], [2048, 512]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2048, 1], [512, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 13. bmm 16x384x384x192 NT (-10.9%)\naten::bmm \u0027[[16, 384, 192], [16, 192, 384]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[73728, 192, 1], [73728, 1, 192]]\u0027 \u0027[\"\", \"\"]\u0027\n# 14. bmm 32x96x96x21 NN (-10.8%)\naten::bmm \u0027[[32, 96, 21], [32, 21, 96]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[2016, 21, 1], [2016, 96, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 15. bmm 32x96x96x96 NT (-10.4%)\naten::bmm \u0027[[32, 96, 96], [32, 96, 96]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[9216, 96, 1], [9216, 1, 96]]\u0027 \u0027[\"\", \"\"]\u0027\n# 16. bmm 32x96x96x96 NN (-9.4%)\naten::bmm \u0027[[32, 96, 96], [32, 96, 96]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[9216, 96, 1], [9216, 96, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 17. mm 528x2304x576 NN (-8.9%)\naten::mm \u0027[[528, 576], [576, 2304]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[576, 1], [2304, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 18. bmm 32x96x96x21 NN (-8.6%)\naten::bmm \u0027[[32, 96, 21], [32, 21, 96]]\u0027 \u0027[\"c10::BFloat16\", \"float\"]\u0027 \u0027[[2016, 21, 1], [2016, 96, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n# 19. addmm 1285x2048x2048 NT (-8.0%)\naten::addmm \u0027[[2048], [1285, 2048], [2048, 2048], [], []]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\", \"c10::BFloat16\", \"Scalar\", \"Scalar\"]\u0027 \u0027[[1], [2048, 1], [1, 2048], [], []]\u0027 \u0027[\"\", \"\", \"\", \"1\", \"1\"]\u0027\n# 20. mm 24576x2048x512 NN (-7.7%)\naten::mm \u0027[[24576, 512], [512, 2048]]\u0027 \u0027[\"c10::BFloat16\", \"c10::BFloat16\"]\u0027 \u0027[[512, 1], [2048, 1]]\u0027 \u0027[\"\", \"\"]\u0027\n```\n\n\u003c/details\u003e\n\n\u003c/details\u003e\n\n---------\n\nSigned-off-by: Yu-Zhewen \u003czhewenyu@amd.com\u003e",
  "tree_diff": [
    {
      "type": "modify",
      "old_id": "888f2950483c7f7ec90e371f0c3aacb5dc91a9f0",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp",
      "new_id": "08c4d773302cf4b04faea0ffaaf959deafc60dde",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp"
    },
    {
      "type": "modify",
      "old_id": "704ca470c4d8926d3d086f10e1b5e327806fb392",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp",
      "new_id": "2ee423bbf3ad64c7550e6240f11206d7d780df3a",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp"
    },
    {
      "type": "modify",
      "old_id": "9e717760e618cfc8c30532096cd345b741f8af26",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp",
      "new_id": "838ac8d07b54eec0afcd36667ba824c23f78b422",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp"
    },
    {
      "type": "modify",
      "old_id": "a2941f82ce412df95050d92ef52ce2a843330d85",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir",
      "new_id": "38f1f1e4ff9ae3294baa06677017dd504b99389f",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir"
    },
    {
      "type": "modify",
      "old_id": "1df72308e20f46d89591132b30a5b8450179a4f7",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp",
      "new_id": "b761188fc6e141e7cdf280d9252ed050333009e3",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp"
    },
    {
      "type": "modify",
      "old_id": "9680289083ce4a0080a44f5aa363f66220f4f518",
      "old_mode": 33188,
      "old_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h",
      "new_id": "f49c74f40b3b5a8d5581c0d48367d4ddaf74b866",
      "new_mode": 33188,
      "new_path": "compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h"
    }
  ]
}
