// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "experimental/ModelBuilder/ModelBuilder.h"
#include "experimental/ModelBuilder/ModelRunner.h"
#include "experimental/ModelBuilder/VulkanWrapperPass.h"
#include "iree/base/initializer.h"
#include "iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/MemorySpace.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"
#include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"
#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SPIRV/Passes.h"
#include "mlir/Dialect/SPIRV/SPIRVOps.h"
#include "mlir/Dialect/SPIRV/TargetAndABI.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/ExecutionEngine/CRunnerUtils.h"
#include "mlir/ExecutionEngine/RunnerUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/Parser.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
using namespace mlir; // NOLINT
using namespace mlir::edsc; // NOLINT
using namespace mlir::edsc::intrinsics; // NOLINT
static llvm::cl::opt<std::string> vulkanWrapper(
"vulkan-wrapper", llvm::cl::desc("Vulkan wrapper library"),
llvm::cl::value_desc("filename"), llvm::cl::init("-"));
static llvm::cl::opt<bool> correctness(
"correctness",
llvm::cl::desc(
"Compare the result to value calculated on CPU. We will use a smaller "
"matrix multiply in this case to avoid long runtime."),
llvm::cl::init(false));
static llvm::cl::opt<bool> useWorkgroupMemory(
"use-workgroup-memory", llvm::cl::desc("Enable use of workgroup memory"),
llvm::cl::value_desc("boolean"), llvm::cl::init(false));
static llvm::cl::opt<bool> enableLICM(
"enable-licm",
llvm::cl::desc("Enable loop invariant hoisting optimizations"),
llvm::cl::value_desc("boolean"), llvm::cl::init(false));
static void addLoweringPasses(mlir::PassManager &pm,
llvm::ArrayRef<int64_t> numWorkgroups,
llvm::ArrayRef<Type> args) {
pm.addPass(mlir::iree_compiler::createVectorToGPUPass());
pm.addPass(mlir::createLowerAffinePass());
pm.addPass(mlir::createLegalizeStdOpsForSPIRVLoweringPass());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addPass(mlir::iree_compiler::createConvertToSPIRVPass());
auto &spirvModulePM = pm.nest<mlir::spirv::ModuleOp>();
spirvModulePM.addPass(mlir::createSetSpirvABIPass());
spirvModulePM.addPass(mlir::spirv::createLowerABIAttributesPass());
spirvModulePM.addPass(mlir::createCanonicalizerPass());
spirvModulePM.addPass(mlir::createCSEPass());
spirvModulePM.addPass(
mlir::spirv::createUpdateVersionCapabilityExtensionPass());
pm.addPass(mlir::createAddVulkanLaunchWrapperPass(numWorkgroups, args));
mlir::LowerToLLVMOptions llvmOptions = {
/*useBarePtrCallConv=*/false,
/*emitCWrappers=*/true,
/*indexBitwidth=*/mlir::kDeriveIndexBitwidthFromDataLayout};
pm.addPass(createLowerToLLVMPass(llvmOptions));
pm.addPass(mlir::createConvertVulkanLaunchFuncToVulkanCallsPass());
}
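
// Emits a workgroup-scoped spirv.ControlBarrier with acquire/release memory
// semantics to synchronize accesses to workgroup memory.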
static void insertBarrier(OpBuilder &b, Location loc) {
b.create<spirv::ControlBarrierOp>(loc, spirv::Scope::Workgroup,
spirv::Scope::Workgroup,
spirv::MemorySemantics::AcquireRelease);
}
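
// Returns the processor id / processor count pairs used to distribute the two
// parallel matmul loops: the first loop maps to the "y" dimension of
// IdOp/NProcsOp (e.g. gpu.block_id / gpu.grid_dim) and the second to "x".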
template <typename IdOp, typename NProcsOp>
static SmallVector<linalg::ProcInfo, 2> getGpuProcIds(
OpBuilder &b, Location loc, ArrayRef<SubViewOp::Range> parallelLoopRanges) {
if (parallelLoopRanges.size() != 2)
llvm_unreachable("expected two parallel loops for matmul operation");
Type indexType = b.getIndexType();
SmallVector<linalg::ProcInfo, 2> procInfo(2);
procInfo[0] = {b.create<IdOp>(loc, indexType, b.getStringAttr("y")),
b.create<NProcsOp>(loc, indexType, b.getStringAttr("y"))};
procInfo[1] = {b.create<IdOp>(loc, indexType, b.getStringAttr("x")),
b.create<NProcsOp>(loc, indexType, b.getStringAttr("x"))};
return procInfo;
}
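
// Number of subgroups along each dimension of a workgroup; used to size the
// workgroup and to distribute the second level of tiling when workgroup
// memory is enabled.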
constexpr int numSubgroupX = 2;
constexpr int numSubgroupY = 2;
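
// Decomposes the linear gpu.subgroup_id into two coordinates,
// id % numSubgroupX and (id / numSubgroupX) % numSubgroupY, paired with the
// corresponding subgroup counts, to distribute the second-level tiled loops
// across subgroups.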
static SmallVector<linalg::ProcInfo, 2> getSubgroupIds(
OpBuilder &b, Location loc, ArrayRef<SubViewOp::Range> parallelLoopRanges) {
if (parallelLoopRanges.size() != 2)
llvm_unreachable("expected two parallel loops for matmul operation");
Type indexType = b.getIndexType();
Value sg = b.create<gpu::SubgroupIdOp>(loc, indexType);
Value vSubgroupX = b.create<ConstantIndexOp>(loc, numSubgroupX);
Value sgdiv = b.create<SignedDivIOp>(loc, indexType, sg, vSubgroupX);
Value vSubgroupY = b.create<ConstantIndexOp>(loc, numSubgroupY);
SmallVector<linalg::ProcInfo, 2> procInfo(2);
using namespace edsc::op;
procInfo[0] = {sg % vSubgroupX, vSubgroupX};
procInfo[1] = {sgdiv % vSubgroupY, vSubgroupY};
return procInfo;
}
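
// Builds an m x n x k matmul kernel (i8 x i8 -> i32 accumulation), compiles it
// for Vulkan with workgroup tile sizes (tileM, tileN, tileK), runs it, and,
// when `correctness` is set, checks the result against a CPU reference.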
void matMul(int m, int n, int k, int tileM, int tileN, int tileK,
bool correctness) {
const int warpSize = 32;
const int resRows = m;
const int resColumns = n;
const int reductionSize = k;
StringLiteral funcName = "kernel_matmul";
MLIRContext context;
ModelBuilder modelBuilder;
auto typeA =
modelBuilder.getMemRefType({resRows, reductionSize}, modelBuilder.i8);
auto typeB =
modelBuilder.getMemRefType({reductionSize, resColumns}, modelBuilder.i8);
auto typeC = modelBuilder.getMemRefType({resRows, resColumns},
modelBuilder.getI32Type());
// 1. Build the kernel.
{
modelBuilder.addGPUAttr();
FuncOp kernelFunc = modelBuilder.makeFunction(
funcName, {}, {typeA, typeB, typeC}, MLIRFuncOpConfig());
int workgroupSize;
if (useWorkgroupMemory)
workgroupSize = warpSize * numSubgroupX * numSubgroupY;
else
workgroupSize = warpSize;
    // Map one workgroup to a single warp, or to numSubgroupX * numSubgroupY
    // warps when workgroup memory is enabled.
kernelFunc.setAttr(
spirv::getEntryPointABIAttrName(),
spirv::getEntryPointABIAttr({workgroupSize, 1, 1}, &context));
OpBuilder b(&kernelFunc.getBody());
ScopedContext scope(b, kernelFunc.getLoc());
auto A = kernelFunc.getArgument(0);
auto B = kernelFunc.getArgument(1);
auto C = kernelFunc.getArgument(2);
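    // Emit a single linalg.matmul over the full buffers; the codegen strategy
    // below tiles, distributes, and vectorizes it.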
linalg_matmul(TypeRange{}, ValueRange{A, B, C});
std_ret();
}
  // 2. Compile the function, passing the runtime support library needed for
  // vector.print to the execution engine.
ModelRunner runner(modelBuilder.getModuleRef(),
ModelRunner::Target::GPUTarget);
CompilationOptions options;
options.loweringPasses = [&](mlir::PassManager &pm) {
MatmulCodegenStrategy strategy;
    // Use hardcoded values for the cooperative matrix sizes. These will
    // eventually be pulled from the device properties.
const int cooperativeMatrixM = 16;
const int cooperativeMatrixN = 16;
const int cooperativeMatrixK = 32;
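    // Two levels of distribution: WGDistribute maps first-level tiles to
    // workgroups (gpu.block_id / gpu.grid_dim), SGDistribute maps second-level
    // tiles to subgroups within a workgroup.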
linalg::LinalgLoopDistributionOptions WGDistribute;
WGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
linalg::LinalgLoopDistributionOptions SGDistribute;
SGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
SGDistribute.procInfo = getSubgroupIds;
strategy
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileN, tileM, tileK})
.setDistributionOptions(WGDistribute))
.setHoistInvariantCode(enableLICM);
if (useWorkgroupMemory) {
strategy
.promote<linalg::MatmulOp>(
linalg::LinalgPromotionOptions()
.setAllocationDeallocationFns(
mlir::iree_compiler::allocateWorkgroupMemory,
mlir::iree_compiler::deallocateWorkgroupMemory)
.setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
mlir::iree_compiler::copyToWorkgroupMemory)
.setOperandsToPromote({0, 1})
.setUseFullTileBuffers({false, false}))
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes(
{tileN / numSubgroupY, tileM / numSubgroupX, tileK})
.setDistributionOptions(SGDistribute));
}
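    // Vectorize the tiled matmul and unroll the resulting vector.contract ops
    // to the native cooperative matrix shape.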
std::array<int64_t, 3> nativeSize = {cooperativeMatrixN, cooperativeMatrixM,
cooperativeMatrixK};
strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
nativeSize);
modelBuilder.getModuleRef()->walk(
[&](FuncOp fn) { strategy.transform(fn); });
addLoweringPasses(pm, {resColumns / tileN, resRows / tileM, 1},
{typeA, typeB, typeC});
};
runner.compile(options, {vulkanWrapper});
// 3. Allocate data within data structures that interoperate with the MLIR ABI
// conventions used by codegen.
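  // A is filled with odd values and B with an increasing ramp (both wrap
  // modulo 256 since the elements are 8-bit); C starts at zero.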
auto oneInit = [](unsigned idx, uint8_t *ptr) { ptr[idx] = 2 * idx + 1; };
auto incInit = [](unsigned idx, uint8_t *ptr) { ptr[idx] = idx; };
auto zeroInit = [](unsigned idx, uint32_t *ptr) { ptr[idx] = 0; };
auto A = makeInitializedStridedMemRefDescriptor<uint8_t, 2>(
{resRows, reductionSize}, oneInit);
auto B = makeInitializedStridedMemRefDescriptor<uint8_t, 2>(
{reductionSize, resColumns}, incInit);
auto C = makeInitializedStridedMemRefDescriptor<uint32_t, 2>(
{resRows, resColumns}, zeroInit);
auto CPURes = makeInitializedStridedMemRefDescriptor<uint32_t, 2>(
{resRows, resColumns}, zeroInit);
  // If checking correctness, compute the reference result on the CPU.
if (correctness) {
for (int i = 0; i < resRows; i++) {
for (int j = 0; j < resColumns; j++) {
uint32_t acc = (*C)[i][j];
for (int k = 0; k < reductionSize; k++) {
uint32_t a = (*A)[i][k];
uint32_t b = (*B)[k][j];
acc += a * b;
}
(*CPURes)[i][j] = acc;
}
}
}
// 4. Call the funcOp named `funcName`.
auto err = runner.invoke(std::string(funcName) + "_wrapper", A, B, C);
if (err) llvm_unreachable("Error running function.");
if (correctness) {
bool correct = true;
for (int i = 0; i < resRows; i++) {
for (int j = 0; j < resColumns; j++) {
if ((*CPURes)[i][j] != (*C)[i][j]) {
correct = false;
printf("mismatch at index(%i, %i) was expecting %i but got %i\n", i,
j, (*CPURes)[i][j], (*C)[i][j]);
}
}
}
if (correct) printf("pass\n");
}
}
int main(int argc, char **argv) {
ModelBuilder::registerAllDialects();
iree::Initializer::RunInitializers();
  // Allow LLVM setup through the command line and parse the test-specific
  // option for the runtime support library.
llvm::InitLLVM y(argc, argv);
llvm::cl::ParseCommandLineOptions(argc, argv, "BenchMatMulVectorGPU\n");
int m = 4096;
int n = 4096;
int k = 4096;
if (correctness) {
m = 256;
n = 256;
k = 256;
}
printf("Matrix size: %ix%ix%i", m, n, k);
for (int tileK = 32; tileK <= 64; tileK *= 2) {
for (int tileM = 32; tileM <= 256; tileM *= 2) {
for (int tileN = 32; tileN <= 256; tileN *= 2) {
        // Using workgroup memory requires a tile size of at least 128x128 to
        // achieve a full-speed copy from video memory to shared local memory.
if (useWorkgroupMemory && (tileM < 128 || tileN < 128)) continue;
printf("tileM=%i tileN=%i tileK=%i\n", tileM, tileN, tileK);
        // For non-power-of-two tile sizes, round the matrix size up to an
        // even multiple of the tile size.
        // TODO(thomasraoux): enable non-power-of-two tiles once affine.min
        // folding is fixed.
auto paddedM = (m + tileM - 1) / tileM * tileM;
auto paddedN = (n + tileN - 1) / tileN * tileN;
auto paddedK = (k + tileK - 1) / tileK * tileK;
matMul(paddedM, paddedN, paddedK, tileM, tileN, tileK, correctness);
}
}
}
}