// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===- MaliConfig.cpp - Mali CodeGen Configurations -----------------------===//
//
// This file contains CodeGen configurations for Mali GPUs.
//
//===----------------------------------------------------------------------===//

#include <array>

#include "iree/compiler/Codegen/SPIRV/KernelConfig.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"

namespace mlir {
namespace iree_compiler {
namespace detail {
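
/// A tile size and workgroup size pair for distributing an op to workgroups.
/// For example, {{16, 64, 4}, {8, 2, 1}} gives each workgroup a 16x64 output
/// tile that steps through the reduction dimension in chunks of 4, computed
/// by 8x2x1 = 16 invocations.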
struct TileWorkgroupSizePair {
// How many scalar elements each workgroup should handle along each dimension.
std::array<int64_t, 3> tileSize;
// The number of threads per workgroup along each dimension.
std::array<int64_t, 3> workgroupSize;
};
//===----------------------------------------------------------------------===//
// Matmul
//===----------------------------------------------------------------------===//
/// Writes preferred matmul workgroup tile sizes and workgroup sizes into
/// `pairs` for the given matmul `scale` (the product of the untiled output
/// dimensions) and `elementType`.
static void getMatmulTileAndWorkgroupSizes(
int64_t scale, Type elementType,
SmallVectorImpl<TileWorkgroupSizePair> &pairs) {
if (elementType.isF16()) {
const int64_t smallMatrixSizeThreshold = 512 * 512;
// For a smaller destination size we cannot fill the GPU with larger tile
// sizes. Instead we pick smaller tiles along M and N to increase the number
// of workgroups, and a larger K tile size since register pressure is lower
// and the extra instructions help hide latency.
// TODO: The threshold needs to be fine tuned by doing exploration based on
// matrix shapes.
if (scale <= smallMatrixSizeThreshold) {
pairs.push_back(TileWorkgroupSizePair({{16, 32, 16}, {8, 2, 1}}));
} else {
pairs.push_back(TileWorkgroupSizePair({{16, 64, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{8, 128, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{16, 32, 4}, {8, 2, 1}}));
}
return;
}
// TODO: Heuristic picked based on MobileNet performance. We need
// auto-tuning to be able to make a smarter choice.
const int64_t smallMatrixSizeThreshold = 20000;
if (scale <= smallMatrixSizeThreshold) {
pairs.push_back(TileWorkgroupSizePair({{4, 32, 16}, {8, 2, 1}}));
}
pairs.push_back(TileWorkgroupSizePair({{12, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{14, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{10, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{7, 64, 4}, {16, 1, 1}}));
pairs.push_back(TileWorkgroupSizePair({{8, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{6, 32, 4}, {8, 2, 1}}));
pairs.push_back(TileWorkgroupSizePair({{24, 16, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{16, 16, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{24, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{40, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{32, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{16, 8, 4}, {2, 8, 1}}));
pairs.push_back(TileWorkgroupSizePair({{1, 32, 16}, {8, 1, 1}}));
pairs.push_back(TileWorkgroupSizePair({{1, 32, 8}, {8, 1, 1}}));
pairs.push_back(TileWorkgroupSizePair({{1, 32, 4}, {8, 1, 1}}));
}

/// Sets the CodeGen configuration for the given batch matmul op.
static LogicalResult setOpConfig(linalg::BatchMatmulOp op) {
ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
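// Bail out on dynamic shapes; returning success without setting a
// configuration lets the default heuristics handle the op.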
if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
llvm::any_of(rhsShape, ShapedType::isDynamic)) {
return success();
}
// Get a vector of candidate tile sizes, ordered from best to worst.
Type elementType =
op.inputs()[0].getType().cast<ShapedType>().getElementType();
int64_t matmulScale = lhsShape[0] * lhsShape[1] * rhsShape[2];
SmallVector<TileWorkgroupSizePair, 4> pairs;
getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
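// Pick the first pair whose tile sizes evenly divide the static M, N, and K
// dimensions; the batch dimension is tiled by 1 and always divides.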
for (TileWorkgroupSizePair pair : pairs) {
if (lhsShape[1] % pair.tileSize[0] != 0 ||
rhsShape[2] % pair.tileSize[1] != 0 ||
lhsShape[2] % pair.tileSize[2] != 0) {
continue;
}
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
SmallVector<int64_t, 4> numElementsPerWorkgroup = {
    1, pair.tileSize[0], pair.tileSize[1], pair.tileSize[2]};
TileSizesListType tileSizes;
// Workgroup level.
tileSizes.push_back(numElementsPerWorkgroup);
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
// Invocation level.
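// Workgroup dimension x is distributed along N and y along M, hence the
// swapped workgroupSize indices below.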
tileSizes.push_back({numElementsPerWorkgroup[0],
numElementsPerWorkgroup[1] / pair.workgroupSize[1],
numElementsPerWorkgroup[2] / pair.workgroupSize[0],
numElementsPerWorkgroup[3]});
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
pair.workgroupSize);
}
return success();
}
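
/// Sets the CodeGen configuration for the given matmul op.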
static LogicalResult setOpConfig(linalg::MatmulOp op) {
ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
if (llvm::any_of(lhsShape, ShapedType::isDynamic) ||
llvm::any_of(rhsShape, ShapedType::isDynamic)) {
return success();
}
Type elementType =
op.inputs()[0].getType().cast<ShapedType>().getElementType();
int64_t matmulScale = lhsShape[0] * rhsShape[1];
SmallVector<TileWorkgroupSizePair, 4> pairs;
getMatmulTileAndWorkgroupSizes(matmulScale, elementType, pairs);
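// Pick the first pair whose tile sizes evenly divide M, N, and K. For
// example (illustrative), a static 512x512x512 f32 matmul has scale
// 512 * 512 > 20000, selects {{8, 32, 4}, {8, 2, 1}} as the first evenly
// dividing pair, and thus gives each invocation a 4x4x4 tile.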
for (TileWorkgroupSizePair pair : pairs) {
if (lhsShape[0] % pair.tileSize[0] != 0 ||
rhsShape[1] % pair.tileSize[1] != 0 ||
lhsShape[1] % pair.tileSize[2] != 0) {
continue;
}
auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
SmallVector<int64_t, 4> numElementsPerWorkgroup(pair.tileSize.begin(),
pair.tileSize.end());
TileSizesListType tileSizes;
// Workgroup level.
tileSizes.push_back(numElementsPerWorkgroup);
// No tiling at the subgroup level since this target doesn't use subgroup ops
// or shared memory.
tileSizes.emplace_back();
// Invocation level.
tileSizes.push_back({numElementsPerWorkgroup[0] / pair.workgroupSize[1],
numElementsPerWorkgroup[1] / pair.workgroupSize[0],
numElementsPerWorkgroup[2]});
return setOpConfigAndEntryPointFnTranslation(op->getParentOfType<FuncOp>(),
op, tileSizes, {}, pipeline,
pair.workgroupSize);
}
return success();
}

//===----------------------------------------------------------------------===//
// Entry Point
//===----------------------------------------------------------------------===//
LogicalResult setMaliCodeGenConfig(const spirv::TargetEnv &targetEnv,
Operation *rootOp) {
int64_t subgroupSize = targetEnv.getResourceLimits().subgroup_size().getInt();
return TypeSwitch<Operation *, LogicalResult>(rootOp)
.Case<linalg::BatchMatmulOp, linalg::MatmulOp>(
[](auto op) { return setOpConfig(op); })
.Case<linalg::Conv2DNhwcHwcfOp>([subgroupSize](auto op) {
return setConvOpConfig(op, subgroupSize,
/*bestTilingFactor=*/16);
})
.Case<linalg::DepthwiseConv2DNhwOp>([subgroupSize](auto op) {
return setConvOpConfig(op, subgroupSize,
/*bestTilingFactor=*/16);
})
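// Ops without a Mali-specific configuration fall through and use the
// default configuration.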
.Default([](Operation *) { return success(); });
}

} // namespace detail
} // namespace iree_compiler
} // namespace mlir