compiler/src/iree/compiler/Codegen/SPIRV/AMDConfig.cpp - 3p/openxla/iree - Git at Google

 // Copyright 2022 The IREE Authors
 //
 // Licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

 //===- AMDConfig.h - AMD CodeGen Configurations ---------------------------===//
 //
 // This file contains CodeGen configurations for AMD GPUs.
 //
 //===----------------------------------------------------------------------===//

 #include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
 #include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/BuiltinOps.h"

 #define DEBUG_TYPE "iree-spirv-amd-config"

 namespace mlir {
 namespace iree_compiler {
 namespace detail {

 constexpr unsigned AMDSimtSoftwarePipelineDepth = 2;
 constexpr unsigned AMDSimtSoftwarePipelineStoreStage = 0;

 constexpr unsigned AMDCoopMatrixSoftwarePipelineDepth = 1;
 constexpr unsigned AMDCoopMatrixSoftwarePipelineStoreStage = 0;

 constexpr unsigned AMDNumSubgroupsPerWorkgroup = 4;
 // The number of tiles along M and N dimensions per workgroup.
 constexpr unsigned AMDNumMNTilesPerSubgroup = 8;

 static LogicalResult setAMDMatmulConfig(linalg::LinalgOp op,
                                         const spirv::TargetEnv &targetEnv) {
   if (succeeded(setCooperativeMatrixConfig(
           targetEnv, op, AMDNumSubgroupsPerWorkgroup, AMDNumMNTilesPerSubgroup,
           AMDCoopMatrixSoftwarePipelineDepth,
           AMDCoopMatrixSoftwarePipelineStoreStage)))
     return success();

   spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
   const int subgroupSize = limits.getSubgroupSize();
   const std::array<int64_t, 2> workgroupXY = {subgroupSize / 2, 8};
   std::array<int64_t, 3> threadMNK;
   auto inputType = op.getDpsInputOperand(0)->get().getType().cast<ShapedType>();
   if (inputType.getElementType().getIntOrFloatBitWidth() == 16) {
     threadMNK = {8, 8, 32};
   } else {
     threadMNK = {8, 4, 16};
   }
   return setMatmulOpConfig(limits, op, workgroupXY, threadMNK,
                            /*enablePromotion=*/true,
                            AMDSimtSoftwarePipelineDepth,
                            AMDSimtSoftwarePipelineStoreStage);
 }

 // RDNA architecture:
 // https://gpuopen.com/wp-content/uploads/2019/08/RDNA_Architecture_public.pdf
 //
 // Workgroup Processor (WGP) is the block for workgroups in RDNA; it has its own
 // instruction/constant cache, L0 cache x2, Local Data Share (LDS, a.k.a. shared
 // memory), SALU x4, SIMD32 x4.
 //
 // * 1024 registers per SIMD32
 // * 128KB LDS per WGP
 // * Max 20 waves per SIMD32
 // * Max 64KB LDS per workgroup

 LogicalResult setAMDCodeGenConfig(const spirv::TargetEnv &targetEnv,
                                   Operation *rootOp) {
   spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
   int subgroupSize = limits.getSubgroupSize();

   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(rootOp)) {
     if (isMatmulOrBatchMatmul(linalgOp))
       return setAMDMatmulConfig(linalgOp, targetEnv);
   }

   if (auto convOp = dyn_cast<linalg::ConvolutionOpInterface>(rootOp)) {
     auto type = cast<ShapedType>(convOp.image().getType());
     const int bitwidth = type.getElementTypeBitWidth();
     if (bitwidth > 32) return failure();
     const int multipler = 32 / bitwidth;
     bool hasPaddedInput = convOp.image().getDefiningOp<tensor::PadOp>();
     const int bestTilingFactor = (hasPaddedInput ? 16 : 32) * multipler;
     return setConvOpConfig(rootOp, subgroupSize, bestTilingFactor);
   }

   return failure();
 }

 }  // namespace detail
 }  // namespace iree_compiler
 }  // namespace mlir
	// Copyright 2022 The IREE Authors
	//
	// Licensed under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

	//===- AMDConfig.h - AMD CodeGen Configurations ---------------------------===//
	//
	// This file contains CodeGen configurations for AMD GPUs.
	//
	//===----------------------------------------------------------------------===//

	#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
	#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
	#include "iree/compiler/Codegen/Utils/Utils.h"
	#include "llvm/Support/Debug.h"
	#include "mlir/Dialect/Linalg/IR/Linalg.h"
	#include "mlir/Dialect/SPIRV/IR/SPIRVAttributes.h"
	#include "mlir/Dialect/Tensor/IR/Tensor.h"
	#include "mlir/IR/BuiltinOps.h"

	#define DEBUG_TYPE "iree-spirv-amd-config"

	namespace mlir {
	namespace iree_compiler {
	namespace detail {

	constexpr unsigned AMDSimtSoftwarePipelineDepth = 2;
	constexpr unsigned AMDSimtSoftwarePipelineStoreStage = 0;

	constexpr unsigned AMDCoopMatrixSoftwarePipelineDepth = 1;
	constexpr unsigned AMDCoopMatrixSoftwarePipelineStoreStage = 0;

	constexpr unsigned AMDNumSubgroupsPerWorkgroup = 4;
	// The number of tiles along M and N dimensions per workgroup.
	constexpr unsigned AMDNumMNTilesPerSubgroup = 8;

	static LogicalResult setAMDMatmulConfig(linalg::LinalgOp op,
	const spirv::TargetEnv &targetEnv) {
	if (succeeded(setCooperativeMatrixConfig(
	targetEnv, op, AMDNumSubgroupsPerWorkgroup, AMDNumMNTilesPerSubgroup,
	AMDCoopMatrixSoftwarePipelineDepth,
	AMDCoopMatrixSoftwarePipelineStoreStage)))
	return success();

	spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
	const int subgroupSize = limits.getSubgroupSize();
	const std::array<int64_t, 2> workgroupXY = {subgroupSize / 2, 8};
	std::array<int64_t, 3> threadMNK;
	auto inputType = op.getDpsInputOperand(0)->get().getType().cast<ShapedType>();
	if (inputType.getElementType().getIntOrFloatBitWidth() == 16) {
	threadMNK = {8, 8, 32};
	} else {
	threadMNK = {8, 4, 16};
	}
	return setMatmulOpConfig(limits, op, workgroupXY, threadMNK,
	/enablePromotion=/true,
	AMDSimtSoftwarePipelineDepth,
	AMDSimtSoftwarePipelineStoreStage);
	}

	// RDNA architecture:
	// https://gpuopen.com/wp-content/uploads/2019/08/RDNA_Architecture_public.pdf
	//
	// Workgroup Processor (WGP) is the block for workgroups in RDNA; it has its own
	// instruction/constant cache, L0 cache x2, Local Data Share (LDS, a.k.a. shared
	// memory), SALU x4, SIMD32 x4.
	//
	// * 1024 registers per SIMD32
	// * 128KB LDS per WGP
	// * Max 20 waves per SIMD32
	// * Max 64KB LDS per workgroup

	LogicalResult setAMDCodeGenConfig(const spirv::TargetEnv &targetEnv,
	Operation *rootOp) {
	spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits();
	int subgroupSize = limits.getSubgroupSize();

	if (auto linalgOp = dyn_cast<linalg::LinalgOp>(rootOp)) {
	if (isMatmulOrBatchMatmul(linalgOp))
	return setAMDMatmulConfig(linalgOp, targetEnv);
	}

	if (auto convOp = dyn_cast<linalg::ConvolutionOpInterface>(rootOp)) {
	auto type = cast<ShapedType>(convOp.image().getType());
	const int bitwidth = type.getElementTypeBitWidth();
	if (bitwidth > 32) return failure();
	const int multipler = 32 / bitwidth;
	bool hasPaddedInput = convOp.image().getDefiningOp<tensor::PadOp>();
	const int bestTilingFactor = (hasPaddedInput ? 16 : 32) * multipler;
	return setConvOpConfig(rootOp, subgroupSize, bestTilingFactor);
	}

	return failure();
	}

	} // namespace detail
	} // namespace iree_compiler
	} // namespace mlir