// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
//====---------------------------------------------------------------------===//
// Pass to pipeline copy to shared memory for matmul op.
//====---------------------------------------------------------------------===//
namespace mlir {
namespace iree_compiler {
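
// Markers used to communicate between the loop analysis in the pass below and
// the scheduling callback: kPipeliningLoopMarker tags the scf.for loop
// selected for pipelining, kPipeliningGlobalLoad tags the global memory loads
// (and the barriers that must move with them) assigned to stage 0.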
static const StringLiteral kPipeliningLoopMarker = "__pipelining_K_loop__";
static const StringLiteral kPipeliningGlobalLoad = "__pipelining_global_load__";

/// Helper to recursively add operation dependencies within `block` to `dep`
/// set.
static void addDepOps(llvm::SmallDenseSet<Operation*>& dep, Operation* op,
                      Block* block) {
  if (!dep.insert(op).second) return;
  for (Value operand : op->getOperands()) {
    Operation* defOp = operand.getDefiningOp();
    if (defOp && defOp->getBlock() == block) addDepOps(dep, defOp, block);
  }
}

/// Assign stages to the loop ops. Simple logic for now: put loads from global
/// memory in stage 0 and the rest in the last stage (`depth`).
static void getPipelineStages(scf::ForOp forOp,
                              std::vector<std::pair<Operation*, unsigned>>& ops,
                              unsigned depth) {
  if (!forOp->hasAttr(kPipeliningLoopMarker)) return;

  // Track dependencies of the global memory loads.
  llvm::SmallDenseSet<Operation*> loadDep;
  for (Operation& op : forOp.getBody()->getOperations()) {
    if (op.hasAttr(kPipeliningGlobalLoad)) {
      addDepOps(loadDep, &op, forOp.getBody());
    }
  }
  // Create a modulo schedule with the loads from global memory and the
  // operations they depend on in stage 0. Stores to shared memory and the
  // computation are in stage `depth`. In order to have a correct schedule
  // even with back edges we order the stages in decreasing order.
  for (Operation& op : forOp.getBody()->getOperations()) {
    if (!loadDep.count(&op) && !isa<scf::YieldOp>(op))
      ops.push_back(std::make_pair(&op, depth));
  }
  for (Operation& op : forOp.getBody()->getOperations()) {
    if (loadDep.count(&op)) ops.push_back(std::make_pair(&op, 0));
  }
}
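
/// Annotate the async wait ops (gpu::DeviceAsyncWaitOp) created by the
/// pipeliner with the number of async copy groups still in flight, depending
/// on whether the op lands in the kernel or in the epilogue of the pipelined
/// loop.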
static void setAsyncAnnotations(Operation* op,
                                scf::PipeliningOption::PipelinerPart part,
                                unsigned iteration, unsigned depth) {
  auto waitOp = dyn_cast<gpu::DeviceAsyncWaitOp>(op);
  if (!waitOp || waitOp.numGroups()) return;
  int numGroupInFlight = 0;
  if (part == scf::PipeliningOption::PipelinerPart::Kernel) {
    numGroupInFlight = depth - 1;
  } else {
    // By construction there should be no wait op in the prologue as all the
    // waits should be in the last stage.
    assert(part == scf::PipeliningOption::PipelinerPart::Epilogue);
    // Based on the schedule we pick, we know how many groups are in flight
    // for each iteration of the epilogue.
    numGroupInFlight = depth - 1 - iteration;
  }
  OpBuilder b(op);
  waitOp->setAttr(waitOp.numGroupsAttrName(),
                  b.getI32IntegerAttr(numGroupInFlight));
}

namespace {
struct LLVMGPUPipeliningPass
    : public LLVMGPUPipeliningBase<LLVMGPUPipeliningPass> {
  LLVMGPUPipeliningPass(unsigned depth) : depth(depth) {}
  void runOnOperation() override {
    auto funcOp = getOperation();
    MLIRContext* context = &getContext();
    // Mark loops that copy to shared memory as candidates for pipelining.
    funcOp.walk([](scf::ForOp forOp) {
      bool copyToWorkgroupMemory = false;
      OpBuilder builder(forOp.getContext());
      SmallVector<Operation*> barriers;
      for (Operation& op : forOp.getBody()->getOperations()) {
        // Only pipeline the innermost loop; its body must be a flat region.
        if (op.getNumRegions() > 0) return;
        if (isa<gpu::BarrierOp>(op)) {
          barriers.push_back(&op);
        }
        if (isa<gpu::DeviceAsyncCopyOp, gpu::DeviceAsyncCreateGroupOp>(op)) {
          copyToWorkgroupMemory = true;
          op.setAttr(kPipeliningGlobalLoad, builder.getUnitAttr());
          // Async copy ops need to be moved along with the preceding barriers.
          for (Operation* barrier : barriers) {
            barrier->setAttr(kPipeliningGlobalLoad, builder.getUnitAttr());
          }
          barriers.clear();
          continue;
        }
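        // Otherwise look for a copy expressed as a vector.transfer_read from
        // global memory (address space 0) whose only use is a
        // vector.transfer_write into workgroup memory (address space 3).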
        auto ld = dyn_cast<vector::TransferReadOp>(op);
        if (!ld) continue;
        unsigned ldAddSpace =
            ld.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt();
        if (ldAddSpace != 0 || !ld->hasOneUse()) continue;
        auto st =
            dyn_cast<vector::TransferWriteOp>(ld->use_begin()->getOwner());
        if (!st) continue;
        unsigned stAddSpace =
            st.getSource().getType().cast<MemRefType>().getMemorySpaceAsInt();
        if (stAddSpace != 3) continue;
        copyToWorkgroupMemory = true;
        ld->setAttr(kPipeliningGlobalLoad, builder.getUnitAttr());
      }
      if (copyToWorkgroupMemory) {
        forOp->setAttr(kPipeliningLoopMarker, builder.getUnitAttr());
      }
    });
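    // Configure the software pipeliner: getScheduleFn assigns a stage to
    // every op in the marked loops and annotateFn patches the async wait ops
    // emitted for the kernel and epilogue.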
    scf::PipeliningOption options;
    unsigned maxDepth = depth;
    auto getSchedule = [maxDepth](
                           scf::ForOp forOp,
                           std::vector<std::pair<Operation*, unsigned>>& ops) {
      return getPipelineStages(forOp, ops, maxDepth);
    };
    auto setAnnotation = [maxDepth](Operation* op,
                                    scf::PipeliningOption::PipelinerPart part,
                                    unsigned iteration) {
      return setAsyncAnnotations(op, part, iteration, maxDepth);
    };
    options.getScheduleFn = getSchedule;
    options.annotateFn = setAnnotation;
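
    // Run the upstream SCF pipelining patterns; getPipelineStages only
    // produces a schedule for loops tagged with kPipeliningLoopMarker.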
    RewritePatternSet pipeliningPatterns(context);
    scf::populateSCFLoopPipeliningPatterns(pipeliningPatterns, options);
    if (failed(applyPatternsAndFoldGreedily(funcOp,
                                            std::move(pipeliningPatterns)))) {
      return signalPassFailure();
    }
  }

 private:
  unsigned depth;
};
}  // namespace
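
/// Creates a pass that software-pipelines loops copying operands to shared
/// memory: global memory loads are assigned to stage 0 and the rest of the
/// loop body to stage `depth` (see getPipelineStages above).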
std::unique_ptr<OperationPass<func::FuncOp>> createLLVMGPUPipeliningPass(
    unsigned depth) {
  return std::make_unique<LLVMGPUPipeliningPass>(depth);
}

}  // namespace iree_compiler
}  // namespace mlir