Refactor ModelBuilder MatMul GPU benchmark to support multiple strategies (#3591)
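
Add a -target flag to BenchMatMulVectorGPU and split the single hard-coded
codegen path into three per-target strategies (powerVR, NVTuring, mali),
each with its own tiling, distribution, native contraction shape, and
tile-size search ranges. Also change UnrollVector to own its target shape,
since an ArrayRef member would dangle.

A minimal sketch of an invocation; the flags are the ones added or touched
in this diff, but the binary name is an assumption of this sketch:

  $ bench-matmul-vector-gpu -target=NVTuring -matrix-type=f16xf16xf32 \
        -enable-licm=true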
diff --git a/experimental/ModelBuilder/test/BenchMatMulVectorGPU.cpp b/experimental/ModelBuilder/test/BenchMatMulVectorGPU.cpp
index 07d720a..291ccb6 100644
--- a/experimental/ModelBuilder/test/BenchMatMulVectorGPU.cpp
+++ b/experimental/ModelBuilder/test/BenchMatMulVectorGPU.cpp
@@ -67,13 +67,17 @@
static llvm::cl::opt<bool> enableLICM(
"enable-licm",
llvm::cl::desc("Enable loop invariant hoisting optimizations"),
- llvm::cl::value_desc("boolean"), llvm::cl::init(false));
+ llvm::cl::value_desc("boolean"), llvm::cl::init(true));
static llvm::cl::opt<std::string> matType("matrix-type",
llvm::cl::desc("Matrix element type"),
llvm::cl::value_desc("type"),
llvm::cl::init("i8xi8xi32"));
+static llvm::cl::opt<std::string> target(
+    "target", llvm::cl::desc("GPU target used to pick the codegen strategy "
+                             "(powerVR, NVTuring, or mali)"),
+    llvm::cl::value_desc("string"), llvm::cl::init(""));
+
static void addLoweringPasses(mlir::PassManager &pm,
llvm::ArrayRef<int64_t> numWorkgroups,
llvm::ArrayRef<Type> args) {
@@ -245,10 +249,161 @@
return a == b;
}
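+// PowerVR strategy: tile the matmul at the workgroup level, optionally
+// promote the B matrix into workgroup memory, then tile again so that the
+// rows of the workitem tile are spread across a warp, and vectorize with a
+// scalar (1x1x1) contraction shape.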
+static MatmulCodegenStrategy createPowerVRStrategy(int tileM, int tileN,
+ int tileK, int warpSize) {
+ const std::array<int64_t, 3> nativeSize = {1, 1, 1};
+ linalg::LinalgLoopDistributionOptions WIDistribute;
+ linalg::LinalgLoopDistributionOptions WGDistribute;
+ WGDistribute.distributionMethod = {
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters,
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters};
+ WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+
+ WIDistribute.distributionMethod = {
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters,
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters};
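+  // Map the first parallel loop cyclically across the warpSize x-thread
+  // ids; leave the second loop undistributed (proc id 0, single processor).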
+ WIDistribute.procInfo = [warpSize](OpBuilder &b, Location loc,
+ ArrayRef<Range> parallelLoopRanges) {
+ Type indexType = b.getIndexType();
+ SmallVector<linalg::ProcInfo, 2> procInfo(2);
+ procInfo[0] = {
+ b.create<gpu::ThreadIdOp>(loc, indexType, b.getStringAttr("x")),
+ b.create<ConstantIndexOp>(loc, warpSize)};
+ procInfo[1] = {b.create<ConstantIndexOp>(loc, 0),
+ b.create<ConstantIndexOp>(loc, 1)};
+ return procInfo;
+ };
+ MatmulCodegenStrategy strategy;
+ SmallVector<int64_t, 2> promotionList;
+  // Promote only matrix B (operand index 1) to workgroup memory.
+ promotionList.push_back(1);
+ strategy
+ .tile<linalg::MatmulOp>(
+ linalg::LinalgTilingOptions()
+ .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
+ .setTileSizes({tileM, tileN, tileK})
+ .setDistributionOptions(WGDistribute))
+ .setHoistInvariantCode(enableLICM);
+ if (useWorkgroupMemory) {
+ strategy.promote<linalg::MatmulOp>(
+ linalg::LinalgPromotionOptions()
+ .setAllocationDeallocationFns(
+ mlir::iree_compiler::allocateWorkgroupMemory,
+ mlir::iree_compiler::deallocateWorkgroupMemory)
+ .setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
+ mlir::iree_compiler::copyToWorkgroupMemory)
+ .setOperandsToPromote(promotionList)
+ .setUseFullTileBuffers({false, false}));
+ }
+ strategy.tile<linalg::MatmulOp>(
+ linalg::LinalgTilingOptions()
+ .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
+ .setTileSizes({1, tileN, tileK})
+ .setDistributionOptions(WIDistribute));
+ strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
+ nativeSize);
+ return strategy;
+}
+
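+// Mali strategy: tile at the workgroup level (no workgroup-memory
+// promotion), distribute the columns of the workitem tile across a warp,
+// and unroll the contraction to 1x4x1 so the inner dimension maps to
+// 4-wide vector operations.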
+static MatmulCodegenStrategy createMaliStrategy(int tileM, int tileN, int tileK,
+ int warpSize) {
+ const std::array<int64_t, 3> nativeSize = {1, 4, 1};
+ linalg::LinalgLoopDistributionOptions WIDistribute;
+ linalg::LinalgLoopDistributionOptions WGDistribute;
+ WGDistribute.distributionMethod = {
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters,
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters};
+ WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+
+ WIDistribute.distributionMethod = {
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters,
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters};
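+  // Unlike PowerVR, map the second parallel loop (columns) across the
+  // x-thread ids and leave the first (rows) undistributed.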
+ WIDistribute.procInfo = [warpSize](OpBuilder &b, Location loc,
+ ArrayRef<Range> parallelLoopRanges) {
+ Type indexType = b.getIndexType();
+ SmallVector<linalg::ProcInfo, 2> procInfo(2);
+ procInfo[1] = {
+ b.create<gpu::ThreadIdOp>(loc, indexType, b.getStringAttr("x")),
+ b.create<ConstantIndexOp>(loc, warpSize)};
+ procInfo[0] = {b.create<ConstantIndexOp>(loc, 0),
+ b.create<ConstantIndexOp>(loc, 1)};
+ return procInfo;
+ };
+ MatmulCodegenStrategy strategy;
+ strategy
+ .tile<linalg::MatmulOp>(
+ linalg::LinalgTilingOptions()
+ .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
+ .setTileSizes({tileM, tileN, tileK})
+ .setDistributionOptions(WGDistribute))
+ .setHoistInvariantCode(enableLICM);
+ strategy.tile<linalg::MatmulOp>(
+ linalg::LinalgTilingOptions()
+ .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
+ .setTileSizes({tileM, tileN / warpSize, tileK})
+ .setDistributionOptions(WIDistribute));
+ strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
+ nativeSize);
+ return strategy;
+}
+
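+// NVIDIA Turing strategy: pick a native contraction shape matching the
+// cooperative matrix sizes for the element type, tile at the workgroup
+// level, and (when workgroup memory is enabled) promote both operands and
+// tile once more across subgroups.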
+static MatmulCodegenStrategy createTuringStrategy(int tileM, int tileN,
+ int tileK) {
+ std::array<int64_t, 3> nativeSize;
+ if (matType == "i8xi8xi32")
+ nativeSize = {16, 16, 32};
+ else if (matType == "f16xf16xf16")
+ nativeSize = {16, 16, 16};
+ else if (matType == "f16xf16xf32")
+ nativeSize = {16, 16, 16};
+  else
+    llvm_unreachable("Unsupported matrix type");
+ linalg::LinalgLoopDistributionOptions WGDistribute;
+ WGDistribute.distributionMethod = {
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters,
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters};
+ WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
+
+ linalg::LinalgLoopDistributionOptions SGDistribute;
+ SGDistribute.distributionMethod = {
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters,
+ linalg::DistributionMethod::CyclicNumProcsEqNumIters};
+ SGDistribute.procInfo = getSubgroupIds;
+
+ MatmulCodegenStrategy strategy;
+ strategy
+ .tile<linalg::MatmulOp>(
+ linalg::LinalgTilingOptions()
+ .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
+ .setTileSizes({tileM, tileN, tileK})
+ .setDistributionOptions(WGDistribute))
+ .setHoistInvariantCode(enableLICM);
+ if (useWorkgroupMemory) {
+ strategy
+ .promote<linalg::MatmulOp>(
+ linalg::LinalgPromotionOptions()
+ .setAllocationDeallocationFns(
+ mlir::iree_compiler::allocateWorkgroupMemory,
+ mlir::iree_compiler::deallocateWorkgroupMemory)
+ .setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
+ mlir::iree_compiler::copyToWorkgroupMemory)
+ .setOperandsToPromote({0, 1})
+ .setUseFullTileBuffers({false, false}))
+ .tile<linalg::MatmulOp>(
+ linalg::LinalgTilingOptions()
+ .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
+ .setTileSizes(
+ {tileM / numSubgroupY, tileN / numSubgroupX, tileK})
+ .setDistributionOptions(SGDistribute));
+ }
+ strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
+ nativeSize);
+ return strategy;
+}
+
template <typename SrcT, typename DstT>
static void matMul(int m, int n, int k, int tileM, int tileN, int tileK,
- const std::array<int64_t, 3> &nativeSize, bool correctness) {
- const int warpSize = 32;
+ bool correctness, int warpSize) {
const int resRows = m;
const int resColumns = n;
const int reductionSize = k;
@@ -294,45 +449,13 @@
options.loweringPasses = [&](mlir::PassManager &pm) {
MatmulCodegenStrategy strategy;
- linalg::LinalgLoopDistributionOptions WGDistribute;
- WGDistribute.distributionMethod = {
- linalg::DistributionMethod::CyclicNumProcsEqNumIters,
- linalg::DistributionMethod::CyclicNumProcsEqNumIters};
- WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
-
- linalg::LinalgLoopDistributionOptions SGDistribute;
- SGDistribute.distributionMethod = {
- linalg::DistributionMethod::CyclicNumProcsEqNumIters,
- linalg::DistributionMethod::CyclicNumProcsEqNumIters};
- SGDistribute.procInfo = getSubgroupIds;
-
- strategy
- .tile<linalg::MatmulOp>(
- linalg::LinalgTilingOptions()
- .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
- .setTileSizes({tileM, tileN, tileK})
- .setDistributionOptions(WGDistribute))
- .setHoistInvariantCode(enableLICM);
- if (useWorkgroupMemory) {
- strategy
- .promote<linalg::MatmulOp>(
- linalg::LinalgPromotionOptions()
- .setAllocationDeallocationFns(
- mlir::iree_compiler::allocateWorkgroupMemory,
- mlir::iree_compiler::deallocateWorkgroupMemory)
- .setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
- mlir::iree_compiler::copyToWorkgroupMemory)
- .setOperandsToPromote({0, 1})
- .setUseFullTileBuffers({false, false}))
- .tile<linalg::MatmulOp>(
- linalg::LinalgTilingOptions()
- .setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
- .setTileSizes(
- {tileM / numSubgroupY, tileN / numSubgroupX, tileK})
- .setDistributionOptions(SGDistribute));
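+    // Dispatch to the per-target strategy; the target flag is validated in
+    // main(), so exactly one branch matches.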
+ if (target == "powerVR") {
+ strategy = createPowerVRStrategy(tileM, tileN, tileK, warpSize);
+ } else if (target == "NVTuring") {
+ strategy = createTuringStrategy(tileM, tileN, tileK);
+ } else if (target == "mali") {
+ strategy = createMaliStrategy(tileM, tileN, tileK, warpSize);
}
- strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
- nativeSize);
modelBuilder.getModuleRef()->walk(
[&](FuncOp fn) { strategy.transform(fn); });
addLoweringPasses(pm, {resColumns / tileN, resRows / tileM, 1},
@@ -394,27 +517,22 @@
}
static void matMul(int m, int n, int k, int tileM, int tileN, int tileK,
- bool correctness) {
- std::array<int64_t, 3> nativeMatSize;
+ bool correctness, int warpSize) {
if (matType == "i8xi8xi32") {
- nativeMatSize = {16, 16, 32};
return matMul<MatMulI8, MatMulI32>(m, n, k, tileM, tileN, tileK,
- nativeMatSize, correctness);
+ correctness, warpSize);
}
if (matType == "f16xf16xf16") {
- nativeMatSize = {16, 16, 16};
return matMul<MatMulF16, MatMulF16>(m, n, k, tileM, tileN, tileK,
- nativeMatSize, correctness);
+ correctness, warpSize);
}
if (matType == "f16xf16xf32") {
- nativeMatSize = {16, 16, 16};
return matMul<MatMulF16, MatMulF32>(m, n, k, tileM, tileN, tileK,
- nativeMatSize, correctness);
+ correctness, warpSize);
}
if (matType == "f32xf32xf32") {
- nativeMatSize = {1, 1, 1};
return matMul<MatMulF32, MatMulF32>(m, n, k, tileM, tileN, tileK,
- nativeMatSize, correctness);
+ correctness, warpSize);
}
llvm_unreachable("Unsupported matrix type");
}
@@ -425,6 +543,10 @@
// test specific option for a runtime support library.
llvm::InitLLVM y(argc, argv);
llvm::cl::ParseCommandLineOptions(argc, argv, "BenchMatMulVectorGPU\n");
+  if (target.empty()) {
+    llvm::errs() << "No target specified; expected -target=powerVR, "
+                    "NVTuring, or mali.\n";
+    return 0;
+  }
int m = 4096;
int n = 4096;
int k = 4096;
@@ -433,13 +555,42 @@
n = 256;
k = 256;
}
+ int warpSize = 32;
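+  // Per-target search ranges (min, max) for each tile dimension; the loops
+  // below sweep them in powers of two.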
+ std::pair<int, int> tileMRange;
+ std::pair<int, int> tileNRange;
+ std::pair<int, int> tileKRange;
+ if (target == "powerVR") {
+ m = std::max(m, 1024);
+ n = std::max(n, 1024);
+ k = std::max(k, 1024);
+ tileMRange = {32, 32};
+ tileNRange = {32, 32};
+ tileKRange = {4, 4};
+ } else if (target == "NVTuring") {
+ tileMRange = {32, 256};
+ tileNRange = {32, 256};
+ tileKRange = {32, 64};
+    // Workgroup memory needs a tile size of at least 128x128 to sustain
+    // full-speed copies from video memory to shared local memory.
+ if (useWorkgroupMemory) {
+ tileMRange.first = 128;
+ tileNRange.first = 128;
+ }
+ } else if (target == "mali") {
+ warpSize = 16;
+ tileMRange = {1, 8};
+ tileNRange = {64, 128};
+ tileKRange = {4, 4};
+ } else {
+ llvm::errs() << "Unknown target";
+ return 0;
+ }
+
printf("Matrix size: %ix%ix%i\n", m, n, k);
- for (int tileK = 32; tileK <= 64; tileK *= 2) {
- for (int tileM = 32; tileM <= 256; tileM *= 2) {
- for (int tileN = 32; tileN <= 256; tileN *= 2) {
- // Workgroup memory requires at least a tile size of 128x128 to be able
- // to do full speed copy from video memory to shared local memory.
- if (useWorkgroupMemory && (tileM < 128 || tileN < 128)) continue;
+ for (int tileK = tileKRange.first; tileK <= tileKRange.second; tileK *= 2) {
+ for (int tileM = tileMRange.first; tileM <= tileMRange.second; tileM *= 2) {
+ for (int tileN = tileNRange.first; tileN <= tileNRange.second;
+ tileN *= 2) {
printf("tileM=%i tileN=%i tileK=%i\n", tileM, tileN, tileK);
// For non-power of two tile sizes, round up the matrix size to
// be an even multiple of the tile size.
@@ -449,7 +600,8 @@
auto paddedN = (n + tileN - 1) / tileN * tileN;
auto paddedK = (k + tileK - 1) / tileK * tileK;
- matMul(paddedM, paddedN, paddedK, tileM, tileN, tileK, correctness);
+ matMul(paddedM, paddedN, paddedK, tileM, tileN, tileK, correctness,
+ warpSize);
}
}
}
diff --git a/iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h b/iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h
index dec79dd..1a3ad7b 100644
--- a/iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h
+++ b/iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h
@@ -42,7 +42,7 @@
template <typename VectorOpType>
struct UnrollVector : public Transformation {
explicit UnrollVector(ArrayRef<int64_t> targetShape)
- : targetShape(targetShape) {}
+ : targetShape(targetShape.begin(), targetShape.end()) {}
OwningRewritePatternList buildRewritePatterns(
MLIRContext *ctx, linalg::LinalgMarker m) override {
@@ -57,7 +57,7 @@
}
private:
- ArrayRef<int64_t> targetShape;
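+  // Own a copy of the shape: an ArrayRef member would dangle once the
+  // caller's temporary array goes out of scope.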
+ SmallVector<int64_t, 4> targetShape;
};
/// Promotion transformation enqueues a particular stage-1 pattern for