// compiler/src/iree/compiler/Codegen/Common/GPU/GPUTileAndConvertConvToMatmul.cpp - 3p/openxla/iree - Git at Google

 // Copyright 2025 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 #include "iree/compiler/Codegen/Common/TileAndFuseUtils.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h"
 #include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

 namespace mlir::iree_compiler {

 #define GEN_PASS_DEF_GPUTILEANDCONVERTCONVTOMATMULPASS
 #include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

 namespace {
 /// Pass implementation class built on top of the
 /// `GPUTileAndConvertConvToMatmulPassBase` declared through the
 /// `Passes.h.inc` include. The actual work is done in `runOnOperation`, which
 /// is defined out of line.
 class GPUTileAndConvertConvToMatmulPass final
     : public impl::GPUTileAndConvertConvToMatmulPassBase<
           GPUTileAndConvertConvToMatmulPass> {
 public:
   void runOnOperation() override;
 };
 } // namespace

 // TODO : Upstream utility that does this pruning is broken for LinalgOp. Drop
 // this if that gets fixed.
 /// Returns a copy of the discardable attributes of `op`, leaving out the one
 /// named `LinalgDialect::kMemoizedIndexingMapsAttrName`.
 static SmallVector<NamedAttribute> getPrunedAttributeList(linalg::LinalgOp op) {
   SmallVector<NamedAttribute> keptAttrs;
   for (NamedAttribute namedAttr : op->getDiscardableAttrs()) {
     // The memoized indexing maps entry is the only attribute that is dropped.
     if (namedAttr.getName() ==
         linalg::LinalgDialect::kMemoizedIndexingMapsAttrName) {
       continue;
     }
     keptAttrs.push_back(namedAttr);
   }
   return keptAttrs;
 }

 /// Helper to remove unit filter loop dimensions from the input map of a
 /// convolution operation so that it can become a contraction.
 ///
 /// On success `linalgOp` is replaced by a `linalg.generic` (created at the
 /// current insertion point of `rewriter`) that reuses the original operands,
 /// iterator types, body region and pruned discardable attributes; only the
 /// input indexing map changes.
 ///
 /// The op is left untouched when:
 ///   * `linalg::inferConvolutionDims` fails on it,
 ///   * any inferred stride is different from 1,
 ///   * it does not have exactly two inputs and three indexing maps,
 ///   * a filter loop dimension is not a result of the filter map, or the
 ///     filter shape extent at that result position is not 1.
 static void removeUnitExtentDimsfromMaps(linalg::LinalgOp linalgOp,
                                          RewriterBase &rewriter) {
   auto convDimsOrFailure = linalg::inferConvolutionDims(linalgOp);
   if (failed(convDimsOrFailure)) {
     return;
   }
   const mlir::linalg::ConvolutionDimensions &convDims = *convDimsOrFailure;
   // We can't make strided convolutions into contractions directly so bail out.
   if (llvm::any_of(convDims.strides,
                    [](int64_t stride) { return stride != 1; })) {
     return;
   }
   // The code below indexes maps [0..2] and input [1]; make sure they exist
   // instead of only checking for an empty map list.
   SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
   if (indexingMaps.size() != 3 || linalgOp.getNumDpsInputs() != 2) {
     return;
   }
   AffineMap inputMap = indexingMaps[0];
   AffineMap filterMap = indexingMaps[1];
   AffineMap outputMap = indexingMaps[2];
   MLIRContext *ctx = filterMap.getContext();

   // Check that all filter loop dimensions are unit and then make them zero.
   DenseMap<AffineExpr, AffineExpr> dimMap;
   Value filter = linalgOp.getDpsInputs()[1];
   auto filterType = cast<ShapedType>(filter.getType());
   ArrayRef<int64_t> filterShape = filterType.getShape();
   for (auto filterLoop : convDims.filterLoop) {
     AffineExpr filterLoopExpr = getAffineDimExpr(filterLoop, ctx);
     std::optional<int64_t> maybeDim =
         filterMap.getResultPosition(filterLoopExpr);
     if (!maybeDim || filterShape[*maybeDim] != 1) {
       return;
     }
     dimMap[filterLoopExpr] = getAffineConstantExpr(0, ctx);
   }
   // Rebuild the map explicitly so that it keeps the original number of dims
   // even though some of them are no longer referenced by any result.
   AffineMap replacedInputMap = inputMap.replace(dimMap);
   auto newInputMap =
       AffineMap::get(inputMap.getNumDims(), 0, replacedInputMap.getResults(),
                      inputMap.getContext());

   // No changes to the filter and output map.
   AffineMap newIndexingMaps[] = {newInputMap, filterMap, outputMap};

   // Create the new contraction op and replace the old convolution op. The
   // op's own result types are used (rather than the init operand types) so
   // that the new op is also well formed when the original has no results.
   auto newOp = linalg::GenericOp::create(
       rewriter, linalgOp.getLoc(), linalgOp->getResultTypes(),
       linalgOp.getDpsInputs(), linalgOp.getDpsInits(), newIndexingMaps,
       linalgOp.getIteratorTypesArray(), /*bodyBuild=*/nullptr,
       getPrunedAttributeList(linalgOp));
   // Move the original body into the new op before the old op is erased.
   rewriter.inlineRegionBefore(linalgOp->getRegion(0), newOp.getRegion(),
                               newOp.getRegion().begin());
   rewriter.replaceOp(linalgOp, newOp.getResults());
 }

 void GPUTileAndConvertConvToMatmulPass::runOnOperation() {
   MLIRContext *context = &getContext();
   mlir::FunctionOpInterface funcOp = getOperation();
   // Collect candiates that need to be tiled to convert to matmul.
   IRRewriter rewriter(funcOp);
   SmallVector<linalg::LinalgOp> convCandidates;
   funcOp->walk([&](linalg::LinalgOp linalgOp) {
     auto loweringConfig =
         getLoweringConfig<IREE::GPU::LoweringConfigAttr>(linalgOp);
     if (!loweringConfig) {
       return;
     }
     if (!getMmaKind(loweringConfig)) {
       return;
     }
     auto convDimsOrFailure = linalg::inferConvolutionDims(linalgOp);
     if (failed(convDimsOrFailure)) {
       return;
     }
     convCandidates.push_back(linalgOp);
   });
   // Handle convolution operations by tiling the filter dimensions to 1 so that
   // they can become contractions.
   llvm::SmallDenseSet<TilingInterface> targets;
   llvm::SmallDenseMap<TilingInterface, SmallVector<OpFoldResult>> targetTileMap;
   auto zero = rewriter.getIndexAttr(0);
   auto one = rewriter.getIndexAttr(1);
   for (auto candidate : convCandidates) {
     SmallVector<OpFoldResult> directTileSizes(candidate.getNumLoops(), zero);
     auto convDimsOrFailure = linalg::inferConvolutionDims(candidate);
     for (auto loopDim : convDimsOrFailure->filterLoop) {
       directTileSizes[loopDim] = one;
     }
     auto tilingOp = dyn_cast<TilingInterface>(*candidate);
     targets.insert(tilingOp);
     targetTileMap[tilingOp] = directTileSizes;
   }
   IREE::GPU::TilingLevel reductionLevel = IREE::GPU::TilingLevel::Reduction;
   if (failed(applyTileAndFuseToEachRoot(rewriter, targets, reductionLevel,
                                         /*allowZeroSlices=*/true,
                                         targetTileMap))) {
     funcOp.emitError() << "tiling of level  convolution failed\n";
   }
   // Collect candiates again since the old candidates are not valid
   // after convolution tiling.
   convCandidates = {};
   funcOp->walk([&](linalg::LinalgOp linalgOp) {
     auto loweringConfig =
         getLoweringConfig<IREE::GPU::LoweringConfigAttr>(linalgOp);
     if (!loweringConfig) {
       return;
     }
     // Currently we only convert convolutions that have a MMA attr
     // in there configurations as this is meant to be used for
     // lowering the convolutions to matmul intrinsic. If we
     // want to do this for all convolutions we can drop this check
     // and move this pass to the common directory.
     if (!getMmaKind(loweringConfig)) {
       return;
     }
     convCandidates.push_back(linalgOp);
   });

   // Remove unit extent filter reductions dims from input maps of convolution
   // operations which would make them contractions.
   for (auto candidate : convCandidates) {
     rewriter.setInsertionPoint(candidate);
     removeUnitExtentDimsfromMaps(candidate, rewriter);
   }

   // Apply cleanup patterns.
   {
     RewritePatternSet patterns(context);
     // Merge consecutive insert/extract slice ops to simplify later loop
     // hoisting patterns.
     tensor::populateFoldTensorEmptyPatterns(patterns);
     tensor::populateMergeConsecutiveInsertExtractSlicePatterns(patterns);
     tensor::InsertSliceOp::getCanonicalizationPatterns(patterns, context);
     tensor::ExtractSliceOp::getCanonicalizationPatterns(patterns, context);
     scf::ForOp::getCanonicalizationPatterns(patterns, context);
     if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
       funcOp.emitError() << "tiling cleanup failed\n";
       return signalPassFailure();
     }
   }
 }

 } // namespace mlir::iree_compiler
	// Copyright 2025 The IREE Authors
	//
	// Licensed under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

	#include "iree/compiler/Codegen/Common/TileAndFuseUtils.h"
	#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
	#include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
	#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
	#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
	#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h"
	#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
	#include "mlir/Dialect/Linalg/IR/Linalg.h"
	#include "mlir/Interfaces/FunctionInterfaces.h"
	#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

	namespace mlir::iree_compiler {

	#define GEN_PASS_DEF_GPUTILEANDCONVERTCONVTOMATMULPASS
	#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

	namespace {
	/// Pass implementation class built on top of the
	/// `GPUTileAndConvertConvToMatmulPassBase` declared through the
	/// `Passes.h.inc` include. The actual work is done in `runOnOperation`, which
	/// is defined out of line.
	class GPUTileAndConvertConvToMatmulPass final
	    : public impl::GPUTileAndConvertConvToMatmulPassBase<
	          GPUTileAndConvertConvToMatmulPass> {
	public:
	  void runOnOperation() override;
	};
	} // namespace

	// TODO : Upstream utility that does this pruning is broken for LinalgOp. Drop
	// this if that gets fixed.
	/// Returns a copy of the discardable attributes of `op`, leaving out the one
	/// named `LinalgDialect::kMemoizedIndexingMapsAttrName`.
	static SmallVector<NamedAttribute> getPrunedAttributeList(linalg::LinalgOp op) {
	  SmallVector<NamedAttribute> keptAttrs;
	  for (NamedAttribute namedAttr : op->getDiscardableAttrs()) {
	    // The memoized indexing maps entry is the only attribute that is dropped.
	    if (namedAttr.getName() ==
	        linalg::LinalgDialect::kMemoizedIndexingMapsAttrName) {
	      continue;
	    }
	    keptAttrs.push_back(namedAttr);
	  }
	  return keptAttrs;
	}

	/// Helper to remove unit filter loop dimensions from the input map of a
	/// convolution operation so that it can become a contraction.
	///
	/// On success `linalgOp` is replaced by a `linalg.generic` (created at the
	/// current insertion point of `rewriter`) that reuses the original operands,
	/// iterator types, body region and pruned discardable attributes; only the
	/// input indexing map changes.
	///
	/// The op is left untouched when:
	///   * `linalg::inferConvolutionDims` fails on it,
	///   * any inferred stride is different from 1,
	///   * it does not have exactly two inputs and three indexing maps,
	///   * a filter loop dimension is not a result of the filter map, or the
	///     filter shape extent at that result position is not 1.
	static void removeUnitExtentDimsfromMaps(linalg::LinalgOp linalgOp,
	                                         RewriterBase &rewriter) {
	  auto convDimsOrFailure = linalg::inferConvolutionDims(linalgOp);
	  if (failed(convDimsOrFailure)) {
	    return;
	  }
	  const mlir::linalg::ConvolutionDimensions &convDims = *convDimsOrFailure;
	  // We can't make strided convolutions into contractions directly so bail out.
	  if (llvm::any_of(convDims.strides,
	                   [](int64_t stride) { return stride != 1; })) {
	    return;
	  }
	  // The code below indexes maps [0..2] and input [1]; make sure they exist
	  // instead of only checking for an empty map list.
	  SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
	  if (indexingMaps.size() != 3 || linalgOp.getNumDpsInputs() != 2) {
	    return;
	  }
	  AffineMap inputMap = indexingMaps[0];
	  AffineMap filterMap = indexingMaps[1];
	  AffineMap outputMap = indexingMaps[2];
	  MLIRContext *ctx = filterMap.getContext();

	  // Check that all filter loop dimensions are unit and then make them zero.
	  DenseMap<AffineExpr, AffineExpr> dimMap;
	  Value filter = linalgOp.getDpsInputs()[1];
	  auto filterType = cast<ShapedType>(filter.getType());
	  ArrayRef<int64_t> filterShape = filterType.getShape();
	  for (auto filterLoop : convDims.filterLoop) {
	    AffineExpr filterLoopExpr = getAffineDimExpr(filterLoop, ctx);
	    std::optional<int64_t> maybeDim =
	        filterMap.getResultPosition(filterLoopExpr);
	    if (!maybeDim || filterShape[*maybeDim] != 1) {
	      return;
	    }
	    dimMap[filterLoopExpr] = getAffineConstantExpr(0, ctx);
	  }
	  // Rebuild the map explicitly so that it keeps the original number of dims
	  // even though some of them are no longer referenced by any result.
	  AffineMap replacedInputMap = inputMap.replace(dimMap);
	  auto newInputMap =
	      AffineMap::get(inputMap.getNumDims(), 0, replacedInputMap.getResults(),
	                     inputMap.getContext());

	  // No changes to the filter and output map.
	  AffineMap newIndexingMaps[] = {newInputMap, filterMap, outputMap};

	  // Create the new contraction op and replace the old convolution op. The
	  // op's own result types are used (rather than the init operand types) so
	  // that the new op is also well formed when the original has no results.
	  auto newOp = linalg::GenericOp::create(
	      rewriter, linalgOp.getLoc(), linalgOp->getResultTypes(),
	      linalgOp.getDpsInputs(), linalgOp.getDpsInits(), newIndexingMaps,
	      linalgOp.getIteratorTypesArray(), /*bodyBuild=*/nullptr,
	      getPrunedAttributeList(linalgOp));
	  // Move the original body into the new op before the old op is erased.
	  rewriter.inlineRegionBefore(linalgOp->getRegion(0), newOp.getRegion(),
	                              newOp.getRegion().begin());
	  rewriter.replaceOp(linalgOp, newOp.getResults());
	}

	void GPUTileAndConvertConvToMatmulPass::runOnOperation() {
	MLIRContext *context = &getContext();
	mlir::FunctionOpInterface funcOp = getOperation();
	// Collect candiates that need to be tiled to convert to matmul.
	IRRewriter rewriter(funcOp);
	SmallVector<linalg::LinalgOp> convCandidates;
	funcOp->walk([&](linalg::LinalgOp linalgOp) {
	auto loweringConfig =
	getLoweringConfig<IREE::GPU::LoweringConfigAttr>(linalgOp);
	if (!loweringConfig) {
	return;
	}
	if (!getMmaKind(loweringConfig)) {
	return;
	}
	auto convDimsOrFailure = linalg::inferConvolutionDims(linalgOp);
	if (failed(convDimsOrFailure)) {
	return;
	}
	convCandidates.push_back(linalgOp);
	});
	// Handle convolution operations by tiling the filter dimensions to 1 so that
	// they can become contractions.
	llvm::SmallDenseSet<TilingInterface> targets;
	llvm::SmallDenseMap<TilingInterface, SmallVector<OpFoldResult>> targetTileMap;
	auto zero = rewriter.getIndexAttr(0);
	auto one = rewriter.getIndexAttr(1);
	for (auto candidate : convCandidates) {
	SmallVector<OpFoldResult> directTileSizes(candidate.getNumLoops(), zero);
	auto convDimsOrFailure = linalg::inferConvolutionDims(candidate);
	for (auto loopDim : convDimsOrFailure->filterLoop) {
	directTileSizes[loopDim] = one;
	}
	auto tilingOp = dyn_cast<TilingInterface>(*candidate);
	targets.insert(tilingOp);
	targetTileMap[tilingOp] = directTileSizes;
	}
	IREE::GPU::TilingLevel reductionLevel = IREE::GPU::TilingLevel::Reduction;
	if (failed(applyTileAndFuseToEachRoot(rewriter, targets, reductionLevel,
	/allowZeroSlices=/true,
	targetTileMap))) {
	funcOp.emitError() << "tiling of level convolution failed\n";
	}
	// Collect candiates again since the old candidates are not valid
	// after convolution tiling.
	convCandidates = {};
	funcOp->walk([&](linalg::LinalgOp linalgOp) {
	auto loweringConfig =
	getLoweringConfig<IREE::GPU::LoweringConfigAttr>(linalgOp);
	if (!loweringConfig) {
	return;
	}
	// Currently we only convert convolutions that have a MMA attr
	// in there configurations as this is meant to be used for
	// lowering the convolutions to matmul intrinsic. If we
	// want to do this for all convolutions we can drop this check
	// and move this pass to the common directory.
	if (!getMmaKind(loweringConfig)) {
	return;
	}
	convCandidates.push_back(linalgOp);
	});

	// Remove unit extent filter reductions dims from input maps of convolution
	// operations which would make them contractions.
	for (auto candidate : convCandidates) {
	rewriter.setInsertionPoint(candidate);
	removeUnitExtentDimsfromMaps(candidate, rewriter);
	}

	// Apply cleanup patterns.
	{
	RewritePatternSet patterns(context);
	// Merge consecutive insert/extract slice ops to simplify later loop
	// hoisting patterns.
	tensor::populateFoldTensorEmptyPatterns(patterns);
	tensor::populateMergeConsecutiveInsertExtractSlicePatterns(patterns);
	tensor::InsertSliceOp::getCanonicalizationPatterns(patterns, context);
	tensor::ExtractSliceOp::getCanonicalizationPatterns(patterns, context);
	scf::ForOp::getCanonicalizationPatterns(patterns, context);
	if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
	funcOp.emitError() << "tiling cleanup failed\n";
	return signalPassFailure();
	}
	}
	}

	} // namespace mlir::iree_compiler