Revert "[Codegen] Enable DMA by default for F16/BF16 Gemm on gfx950 (#24373)" (#24395)
This reverts commit 4f990431a73902c02288fd5892ddf4540b72998b due to a
performance regression on some 1x1 convolutions.
Signed-off-by: Yu-Zhewen <zhewenyu@amd.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
index 08c4d77..888f295 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp
@@ -173,6 +173,21 @@
return hasAMDGPUFatRawBufferAddressSpace(memrefType);
}
+/// Check if the target architecture supports global load DMA.
+/// Returns true only for CDNA4+ (gfx950+) architectures.
+static bool targetSupportsGlobalLoadDMA(IREE::GPU::TargetAttr target) {
+ if (!target) {
+ return false;
+ }
+ FailureOr<amdgpu::Chipset> chipset = amdgpu::Chipset::parse(target.getArch());
+ if (failed(chipset)) {
+ return false;
+ }
+ // CDNA4 is gfx950+ (major=9, minor>=5). Other major versions (RDNA, etc.)
+ // do not support global load DMA.
+ return chipset->majorVersion == 9 && chipset->minorVersion >= 5;
+}
+
/// Returns the subgroup size if the available elements are aligned to DMA
/// transfer sizes, std::nullopt otherwise.
static std::optional<int64_t>
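For context, not part of the patch: the gate restored above relies on MLIR's
amdgpu::Chipset, which parses a gfx arch string into (major, minor, stepping)
versions. A minimal standalone sketch follows; the helper name is illustrative:

    #include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
    #include "llvm/ADT/StringRef.h"

    // gfx942 (CDNA3) parses as {9, 4, 2}, gfx950 (CDNA4) as {9, 5, 0}, and
    // RDNA3's gfx1100 as {11, 0, 0}; only gfx950+ passes the check below.
    static bool isCDNA4OrLater(llvm::StringRef arch) {
      mlir::FailureOr<mlir::amdgpu::Chipset> maybeChipset =
          mlir::amdgpu::Chipset::parse(arch);
      if (mlir::failed(maybeChipset))
        return false;
      return maybeChipset->majorVersion == 9 &&
             maybeChipset->minorVersion >= 5;
    }

For example, isCDNA4OrLater("gfx942") is false while isCDNA4OrLater("gfx950")
is true.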
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 626c210..9ab5d4e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -557,42 +557,6 @@
return splitReductionTripCnt;
}
-/// Returns true if direct load DMA should be rejected, and fall back to stream
-/// copies.
-///
-/// Rejection cases:
-/// 1. Target does not support DMA (requires gfx950+ / CDNA4+).
-/// 2. Not a GEMM. TODO(#23907): support convolution.
-/// 3. Data types are not f16 or bf16. TODO(#22119): support MXFP4.
-/// 4. LHS transposed, RHS not transposed shows regressions. TODO (#24117).
-static bool shouldRejectDirectLoadDMA(IREE::GPU::TargetAttr target, bool isGemm,
- Type lhsElemType, Type rhsElemType,
- bool transposedLhs, bool transposedRhs) {
- auto isF16OrBF16 = [](Type t) { return t.isF16() || t.isBF16(); };
-
- // Case 1: DMA requires hardware support (gfx950+ / CDNA4+).
- if (!targetSupportsGlobalLoadDMA(target)) {
- return true;
- }
-
- // Case 2: Only GEMM are supported currently.
- if (!isGemm) {
- return true;
- }
-
- // Case 3: Only f16/bf16 are supported currently.
- if (!isF16OrBF16(lhsElemType) || !isF16OrBF16(rhsElemType)) {
- return true;
- }
-
- // Case 4: LHS transposed, RHS not transposed show regressions with DMA.
- if (transposedLhs && !transposedRhs) {
- return true;
- }
-
- return false;
-}
-
/// Create a lowering config for matmul or IGEMM convolution based on iteration
/// bounds and indexing maps for a given target. This function computes
/// contraction dimensions and deduces an MMA intrinsic schedule to choose tile
@@ -608,7 +572,7 @@
getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
ArrayRef<int64_t> bounds, ArrayRef<AffineMap> maps,
ArrayRef<Value> operands, IREE::GPU::TargetAttr target, bool isGemm,
- bool scaled, bool &useDirectLoad, int64_t prefetchNumStages,
+ bool scaled, bool useDirectLoad, int64_t prefetchNumStages,
int64_t splitReductionTripCnt, bool hasExistingAccumulator = false,
std::optional<ConvToIgemmInfo> convToIgemmInfo = std::nullopt) {
if (target.getWgp().getMma().empty()) {
@@ -786,11 +750,13 @@
lhsScaleType,
rhsScaleType};
+ // TODO(#22119): We don't use global load DMA for scaled matmuls, because
+ // compilation doesn't support it. Once this is fixed, we should use global
+ // load DMA here when possible.
Location loc = operands[0].getLoc();
- if (useDirectLoad &&
- shouldRejectDirectLoadDMA(target, isGemm, lhsElemType, rhsElemType,
- transposedLhs, transposedRhs)) {
- LDBG() << "overriding direct load DMA, falling back to stream copies";
+ if (scaled && useDirectLoad) {
+ mlir::emitWarning(loc) << "direct load (global load DMA) is not yet "
+ "supported for scaled matmuls, ignoring";
useDirectLoad = false;
}
@@ -916,7 +882,7 @@
// Apply XOR swizzle for BF16 DMA operands whose reduction dim is
// innermost (contiguous reads) to avoid LDS bank conflicts.
// TODO(#24255): Fix untuned swizzle logic for DMA.
- if (!transposedLhs) {
+ if (lhsElemType.isBF16() && !transposedLhs) {
FailureOr<Attribute> lhsSwizzleAttr = getXorShuffleAttr(
context, lhsAttr, target, kind, schedule->kTileSizes, kMMAOperandLhs,
/*skipUntunedFallback=*/true);
@@ -924,7 +890,7 @@
lhsAttr = *lhsSwizzleAttr;
}
}
- if (transposedRhs) {
+ if (rhsElemType.isBF16() && transposedRhs) {
FailureOr<Attribute> rhsSwizzleAttr = getXorShuffleAttr(
context, rhsAttr, target, kind, schedule->kTileSizes, kMMAOperandRhs,
/*skipUntunedFallback=*/true);
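Illustrative, not part of the patch: the two guards restored above both encode
"BF16 operand whose reduction (K) dimension is innermost". Assuming the usual
convention that a non-transposed LHS is MxK and a non-transposed RHS is KxN,
the combined predicate looks like this sketch (the helper is hypothetical):

    // XOR swizzle applies only when global reads are contiguous along K and
    // the element type is BF16, since only the BF16 MFMA variants remain in
    // the tuned XorShuffleParams table after this revert.
    static bool shouldApplyXorSwizzle(mlir::Type elemType, bool transposed,
                                      bool isLhs) {
      // LHS (MxK) is K-innermost when not transposed; RHS is K-innermost
      // only when transposed to NxK.
      bool kInnermost = isLhs ? !transposed : transposed;
      return elemType.isBF16() && kInnermost;
    }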
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 1f28eff..8a5f83d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -106,7 +106,7 @@
static llvm::cl::opt<bool>
clUseDirectLoad("iree-llvmgpu-use-direct-load",
llvm::cl::desc("Use global load DMA for direct load ops."),
- llvm::cl::Hidden, llvm::cl::init(true));
+ llvm::cl::Hidden, llvm::cl::init(false));
static llvm::cl::opt<bool> clDirectConvolution(
"iree-codegen-llvmgpu-use-direct-convolution",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir
index 2673f4c..dc91220 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir
@@ -1,11 +1,11 @@
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx950 \
// RUN: --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
-// RUN: --iree-codegen-llvmgpu-use-igemm=false --iree-llvmgpu-use-direct-load=false \
+// RUN: --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx950 \
// RUN: --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
-// RUN: --iree-codegen-llvmgpu-use-igemm=false --iree-llvmgpu-use-direct-load=false \
+// RUN: --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
// RUN: --remarks-filter=".*" %s 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
@@ -36,6 +36,11 @@
// RUN: --iree-codegen-llvmgpu-use-igemm=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=IGEMM
+// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx950 \
+// RUN: --iree-codegen-llvmgpu-use-igemm=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
+// RUN: --iree-llvmgpu-use-direct-load=true \
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s \
+// RUN: | FileCheck %s --check-prefix=IGEMM-DIRECT-LOAD
#lhs_map = affine_map<(M, N, Ko, Kb) -> (M, Ko, Kb)>
#rhs_map = affine_map<(M, N, Ko, Kb) -> (N, Ko, Kb)>
@@ -593,32 +598,21 @@
// -----
-// BF16 1x1 conv (preprocessed to fold unit spatial dims) with DMA. The MMA intrinsic
-// (MFMA_F32_32x32x8_BF16) is not in the tuned swizzle table, so no XOR
-// swizzle should be applied -- only plain use_global_load_dma.
-func.func @conv_1x1_bf16_no_untuned_swizzle(
+// BF16 1x1 conv with DMA. The MMA intrinsic (MFMA_F32_32x32x8_BF16) is not in
+// the tuned swizzle table, so no XOR swizzle should be applied -- only plain
+// use_global_load_dma.
+func.func @conv_bf16_no_untuned_swizzle(
%arg0: tensor<16x96x64x40xbf16>,
- %arg1: tensor<40x40xbf16>) -> tensor<16x96x64x40xf32> {
+ %arg1: tensor<40x1x1x40xbf16>) -> tensor<16x96x64x40xf32> {
%cst = arith.constant 0.000000e+00 : f32
%empty = tensor.empty() : tensor<16x96x64x40xf32>
%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<16x96x64x40xf32>) -> tensor<16x96x64x40xf32>
- %result = linalg.generic {
- indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>,
- affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>],
- iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]
- } ins(%arg0, %arg1 : tensor<16x96x64x40xbf16>, tensor<40x40xbf16>)
- outs(%fill : tensor<16x96x64x40xf32>) {
- ^bb0(%in: bf16, %in_1: bf16, %out: f32):
- %0 = arith.extf %in : bf16 to f32
- %1 = arith.extf %in_1 : bf16 to f32
- %2 = arith.mulf %0, %1 : f32
- %3 = arith.addf %out, %2 : f32
- linalg.yield %3 : f32
- } -> tensor<16x96x64x40xf32>
+ %result = linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
+ ins(%arg0, %arg1 : tensor<16x96x64x40xbf16>, tensor<40x1x1x40xbf16>)
+ outs(%fill : tensor<16x96x64x40xf32>) -> tensor<16x96x64x40xf32>
return %result : tensor<16x96x64x40xf32>
}
-// CHECK-DIRECT-LOAD-LABEL: func.func @conv_1x1_bf16_no_untuned_swizzle
-// CHECK-DIRECT-LOAD: linalg.generic
-// CHECK-DIRECT-LOAD-SAME: promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma]
+// IGEMM-DIRECT-LOAD-LABEL: func.func @conv_bf16_no_untuned_swizzle
+// IGEMM-DIRECT-LOAD: linalg.conv_2d_nhwc_fhwc {
+// IGEMM-DIRECT-LOAD-SAME: promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma]
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
index dc5e343..e1adfa5 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -756,7 +756,7 @@
int operandIndex) {
SmallVector<Type> elementTypes;
intrinsic.getElementTypes(elementTypes);
- assert(operandIndex > 0 && "operand index must be positive");
+ assert(operandIndex >= 0 && "operand index must be non-negative");
return elementTypes[operandIndex].getIntOrFloatBitWidth();
}
@@ -819,8 +819,6 @@
}
if (auto mma = dyn_cast<IREE::GPU::MMAAttr>(intrinsic)) {
switch (mma.getIntrinsic()) {
- case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x32_F16:
- case IREE::GPU::MMAIntrinsic::MFMA_F32_32x32x16_F16:
case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x32_BF16:
case IREE::GPU::MMAIntrinsic::MFMA_F32_32x32x16_BF16:
return XorShuffleParams({/*rowElems=*/64,
@@ -1311,18 +1309,6 @@
return getGPUTargetAttr(op->getContext(),
IREE::HAL::ExecutableTargetAttr::lookup(op));
}
-
-bool targetSupportsGlobalLoadDMA(IREE::GPU::TargetAttr target) {
- if (!target) {
- return false;
- }
- FailureOr<amdgpu::Chipset> chipset = amdgpu::Chipset::parse(target.getArch());
- if (failed(chipset)) {
- return false;
- }
- return chipset->majorVersion == 9 && chipset->minorVersion >= 5;
-}
-
void addConfigGPUTarget(MLIRContext *context,
IREE::GPU::TargetAttr gpuTargetAttr,
SmallVectorImpl<NamedAttribute> &config) {
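A note on the assert hunk above, as a sketch assuming getElementTypes
populates {LHS, RHS, ACC} in that order: operand index 0 (the LHS) is valid,
which is why the lower bound must accept zero. The free function below is
hypothetical; the upper-bound check is added purely for illustration:

    #include <cassert>

    // Valid operand indices are 0 (LHS), 1 (RHS), and 2 (ACC), so the lower
    // bound is >= 0 rather than > 0.
    unsigned getOperandBitWidth(llvm::ArrayRef<mlir::Type> elementTypes,
                                int operandIndex) {
      assert(operandIndex >= 0 &&
             operandIndex < static_cast<int>(elementTypes.size()) &&
             "operand index out of range");
      return elementTypes[operandIndex].getIntOrFloatBitWidth();
    }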
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
index 8037d8e..631e020 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -330,10 +330,6 @@
IREE::HAL::ExecutableTargetAttr attr);
IREE::GPU::TargetAttr getGPUTargetAttr(Operation *op);
-/// Check if the target architecture supports global load DMA.
-/// Returns true only for CDNA4+ (gfx950+) architectures.
-bool targetSupportsGlobalLoadDMA(IREE::GPU::TargetAttr target);
-
// Methods to retrieve information associated with `configuration` field
// of `hal.executable.target` attribute used commonly in GPU codegen pipelines.
std::optional<int64_t> getConfigWavesPerEu(DictionaryAttr targetAttr);