// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
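
// Standalone benchmark for Vulkan/SPIR-V matmul codegen. It builds a
// linalg.matmul kernel with ModelBuilder, applies a target-specific
// MatmulCodegenStrategy (PowerVR, Mali, or NVIDIA Turing), lowers the kernel
// to SPIR-V, runs it through the Vulkan wrapper library, and optionally
// checks the result against a CPU reference.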
#include <string>
#include "experimental/ModelBuilder/ModelBuilder.h"
#include "experimental/ModelBuilder/ModelRunner.h"
#include "experimental/ModelBuilder/VulkanWrapperPass.h"
#include "iree/base/initializer.h"
#include "iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/MemorySpace.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"
#include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"
#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SPIRV/Passes.h"
#include "mlir/Dialect/SPIRV/SPIRVOps.h"
#include "mlir/Dialect/SPIRV/TargetAndABI.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/ExecutionEngine/CRunnerUtils.h"
#include "mlir/ExecutionEngine/RunnerUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/Parser.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
using namespace mlir; // NOLINT
using namespace mlir::edsc; // NOLINT
using namespace mlir::edsc::intrinsics; // NOLINT
static llvm::cl::opt<std::string> vulkanWrapper(
"vulkan-wrapper", llvm::cl::desc("Vulkan wrapper library"),
llvm::cl::value_desc("filename"), llvm::cl::init("-"));
static llvm::cl::opt<bool> correctness(
"correctness",
llvm::cl::desc(
"Compare the result to value calculated on CPU. We will use a smaller "
"matrix multiply in this case to avoid long runtime."),
llvm::cl::init(false));
static llvm::cl::opt<bool> useWorkgroupMemory(
"use-workgroup-memory", llvm::cl::desc("Enable use of workgroup memory"),
llvm::cl::value_desc("boolean"), llvm::cl::init(false));
static llvm::cl::opt<bool> enableLICM(
"enable-licm",
llvm::cl::desc("Enable loop invariant hoisting optimizations"),
llvm::cl::value_desc("boolean"), llvm::cl::init(true));
static llvm::cl::opt<std::string> matType("matrix-type",
llvm::cl::desc("Matrix element type"),
llvm::cl::value_desc("type"),
llvm::cl::init("i8xi8xi32"));
static llvm::cl::opt<std::string> target(
"target", llvm::cl::desc("Platform target to decide the strategy"),
llvm::cl::value_desc("type"), llvm::cl::init(""));
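
// Example invocation (the binary name and wrapper path below are
// illustrative, not part of this file):
//   bench-matmul-vector-gpu -target=NVTuring -matrix-type=f16xf16xf32 \
//     -vulkan-wrapper=/path/to/libvulkan-runtime-wrappers.so -correctness

// Lowering pipeline: convert the vectorized linalg kernel to GPU/SPIR-V,
// wrap it in a Vulkan launch, and lower the host-side code to LLVM.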
static void addLoweringPasses(mlir::PassManager &pm,
llvm::ArrayRef<int64_t> numWorkgroups,
llvm::ArrayRef<Type> args) {
pm.addPass(mlir::iree_compiler::createVectorToGPUPass());
pm.addPass(mlir::createLowerAffinePass());
pm.addPass(mlir::createLegalizeStdOpsForSPIRVLoweringPass());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addPass(mlir::iree_compiler::createVectorizeMemref());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addPass(mlir::iree_compiler::createConvertToSPIRVPass());
auto &spirvModulePM = pm.nest<mlir::spirv::ModuleOp>();
spirvModulePM.addPass(mlir::createSetSpirvABIPass());
spirvModulePM.addPass(mlir::spirv::createLowerABIAttributesPass());
spirvModulePM.addPass(mlir::createCanonicalizerPass());
spirvModulePM.addPass(mlir::createCSEPass());
spirvModulePM.addPass(
mlir::spirv::createUpdateVersionCapabilityExtensionPass());
pm.addPass(mlir::createAddVulkanLaunchWrapperPass(numWorkgroups, args));
mlir::LowerToLLVMOptions llvmOptions = {
/*useBarePtrCallConv=*/false,
/*emitCWrappers=*/true,
/*indexBitwidth=*/mlir::kDeriveIndexBitwidthFromDataLayout};
pm.addPass(createLowerToLLVMPass(llvmOptions));
pm.addPass(mlir::createConvertVulkanLaunchFuncToVulkanCallsPass());
}
static void insertBarrier(OpBuilder &b, Location loc) {
b.create<spirv::ControlBarrierOp>(loc, spirv::Scope::Workgroup,
spirv::Scope::Workgroup,
spirv::MemorySemantics::AcquireRelease);
}
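// Returns the processor-id / processor-count pairs used to distribute the two
// parallel loops of the matmul: loop 0 maps to dimension "y" and loop 1 to
// dimension "x" (e.g. gpu.block_id / gpu.grid_dim at the workgroup level).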
template <typename IdOp, typename NProcsOp>
static SmallVector<linalg::ProcInfo, 2> getGpuProcIds(
OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges) {
if (parallelLoopRanges.size() != 2)
llvm_unreachable("expected two parallel loops for matmul operation");
Type indexType = b.getIndexType();
SmallVector<linalg::ProcInfo, 2> procInfo(2);
procInfo[0] = {b.create<IdOp>(loc, indexType, b.getStringAttr("y")),
b.create<NProcsOp>(loc, indexType, b.getStringAttr("y"))};
procInfo[1] = {b.create<IdOp>(loc, indexType, b.getStringAttr("x")),
b.create<NProcsOp>(loc, indexType, b.getStringAttr("x"))};
return procInfo;
}
constexpr int numSubgroupX = 2;
constexpr int numSubgroupY = 2;
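// Splits the linear gpu.subgroup_id into a (y, x) pair over a
// numSubgroupY x numSubgroupX grid so that each subgroup owns a distinct piece
// of the workgroup-level tile.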
static SmallVector<linalg::ProcInfo, 2> getSubgroupIds(
OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges) {
if (parallelLoopRanges.size() != 2)
llvm_unreachable("expected two parallel loops for matmul operation");
Type indexType = b.getIndexType();
Value sg = b.create<gpu::SubgroupIdOp>(loc, indexType);
Value vSubgroupX = b.create<ConstantIndexOp>(loc, numSubgroupX);
Value sgdiv = b.create<SignedDivIOp>(loc, indexType, sg, vSubgroupX);
Value vSubgroupY = b.create<ConstantIndexOp>(loc, numSubgroupY);
SmallVector<linalg::ProcInfo, 2> procInfo(2);
using namespace edsc::op;
procInfo[0] = {sgdiv % vSubgroupY, vSubgroupY};
procInfo[1] = {sg % vSubgroupX, vSubgroupX};
return procInfo;
}
struct MatMulF32 {
using Type = float;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return FloatType::getF32(&ctx);
}
};
struct MatMulI8 {
using Type = uint8_t;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return IntegerType::get(8, &ctx);
}
};
struct MatMulI32 {
using Type = uint32_t;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return IntegerType::get(32, &ctx);
}
};
// Class to emulate half float on CPU.
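// The float<->half conversions below truncate the mantissa (no rounding) and
// do not handle Inf/NaN or subnormals specially, which is good enough for the
// small test values used here; they also rely on pointer type punning for the
// bit-level reinterpretation. Worked example: 1.0f (0x3F800000) has a biased
// exponent of 127, which rebiases to 15 and packs to the half-precision bit
// pattern 0x3C00.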
class fp16 {
public:
void fromFloat(const float &x) {
uint32_t asInt = *(uint32_t *)&x;
int sign = (asInt & 0x80000000) >> 31;
int exp = ((asInt & 0x7f800000) >> 23) - 127 + 15;
int mantissa = (asInt & 0x7FFFFF);
if (exp > 31) exp = 31;
if (exp < 0) exp = 0;
sign = sign << 15;
exp = exp << 10;
mantissa = mantissa >> (23 - 10);
asInt = sign | exp | mantissa;
value = asInt;
}
fp16(const float &x) { fromFloat(x); }
fp16 &operator=(const float &x) {
fromFloat(x);
return *this;
}
fp16 &operator=(const int &x) {
fromFloat((float)x);
return *this;
}
fp16 &operator+=(const fp16 &x) {
fromFloat(toFloat() + x.toFloat());
return *this;
}
float toFloat() const {
uint32_t asInt = value;
int sign = (asInt & 0x8000) >> 15;
int exp = ((asInt & 0x7c00) >> 10);
int mantissa = (asInt & 0x3FF);
sign = sign << 31;
if (exp > 0) {
exp = (exp + 127 - 15) << 23;
mantissa = mantissa << (23 - 10);
} else {
mantissa = 0;
}
asInt = sign | exp | mantissa;
return *(float *)&asInt;
}
operator float() { return toFloat(); }
private:
uint16_t value;
};
struct MatMulF16 {
using Type = fp16;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return FloatType::getF16(&ctx);
}
};
/// Functions to initialize the matrices based on the element type.
template <typename T>
static T getMatA(unsigned idx) {
if (std::is_same<T, float>::value || std::is_same<T, fp16>::value)
return ((float)(idx % 5) - 1.0f) / 2.0f;
else
return (3 * idx + 1) % 117;
}
template <typename T>
static T getMatB(unsigned idx) {
if (std::is_same<T, float>::value || std::is_same<T, fp16>::value)
return ((float)(idx % 7) - 1.0f) / 2.0f;
else
return idx % 127;
}
template <typename T>
static bool EqualOrClose(T a, T b) {
if (std::is_same<T, float>::value || std::is_same<T, fp16>::value)
return fabs((float)a - (float)b) < 0.001f;
return a == b;
}
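// PowerVR strategy: tile the matmul at the workgroup level ({tileM, tileN,
// tileK}, distributed over gpu.block_id), optionally promote matrix B to
// workgroup memory, then tile again per invocation ({1, tileN, tileK},
// distributing the rows over gpu.thread_id x) and unroll vector contractions
// to scalar (1x1x1) native ops.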
static MatmulCodegenStrategy createPowerVRStrategy(int tileM, int tileN,
int tileK, int warpSize) {
const std::array<int64_t, 3> nativeSize = {1, 1, 1};
linalg::LinalgLoopDistributionOptions WIDistribute;
linalg::LinalgLoopDistributionOptions WGDistribute;
WGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
WIDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WIDistribute.procInfo = [warpSize](OpBuilder &b, Location loc,
ArrayRef<Range> parallelLoopRanges) {
Type indexType = b.getIndexType();
SmallVector<linalg::ProcInfo, 2> procInfo(2);
procInfo[0] = {
b.create<gpu::ThreadIdOp>(loc, indexType, b.getStringAttr("x")),
b.create<ConstantIndexOp>(loc, warpSize)};
procInfo[1] = {b.create<ConstantIndexOp>(loc, 0),
b.create<ConstantIndexOp>(loc, 1)};
return procInfo;
};
MatmulCodegenStrategy strategy;
SmallVector<int64_t, 2> promotionList;
  // Promote matrix B (operand index 1) to workgroup memory.
promotionList.push_back(1);
strategy
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN, tileK})
.setDistributionOptions(WGDistribute))
.setHoistInvariantCode(enableLICM);
if (useWorkgroupMemory) {
strategy.promote<linalg::MatmulOp>(
linalg::LinalgPromotionOptions()
.setAllocationDeallocationFns(
mlir::iree_compiler::allocateWorkgroupMemory,
mlir::iree_compiler::deallocateWorkgroupMemory)
.setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
mlir::iree_compiler::copyToWorkgroupMemory)
.setOperandsToPromote(promotionList)
.setUseFullTileBuffers({false, false}));
}
strategy.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({1, tileN, tileK})
.setDistributionOptions(WIDistribute));
strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
nativeSize);
return strategy;
}
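// Mali strategy: tile at the workgroup level as above, then split the N
// dimension across the warp ({tileM, tileN / warpSize, tileK} per invocation,
// distributed over gpu.thread_id x) and unroll vector contractions to a
// 1x4x1 native size.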
static MatmulCodegenStrategy createMaliStrategy(int tileM, int tileN, int tileK,
int warpSize) {
const std::array<int64_t, 3> nativeSize = {1, 4, 1};
linalg::LinalgLoopDistributionOptions WIDistribute;
linalg::LinalgLoopDistributionOptions WGDistribute;
WGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
WIDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WIDistribute.procInfo = [warpSize](OpBuilder &b, Location loc,
ArrayRef<Range> parallelLoopRanges) {
Type indexType = b.getIndexType();
SmallVector<linalg::ProcInfo, 2> procInfo(2);
procInfo[1] = {
b.create<gpu::ThreadIdOp>(loc, indexType, b.getStringAttr("x")),
b.create<ConstantIndexOp>(loc, warpSize)};
procInfo[0] = {b.create<ConstantIndexOp>(loc, 0),
b.create<ConstantIndexOp>(loc, 1)};
return procInfo;
};
MatmulCodegenStrategy strategy;
strategy
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN, tileK})
.setDistributionOptions(WGDistribute))
.setHoistInvariantCode(enableLICM);
strategy.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN / warpSize, tileK})
.setDistributionOptions(WIDistribute));
strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
nativeSize);
return strategy;
}
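// Turing strategy: the native contraction sizes are chosen to match the
// tensor core / cooperative matrix shapes (16x16x32 for i8 inputs, 16x16x16
// for f16). Workgroup-level tiles are distributed over gpu.block_id; when
// workgroup memory is enabled, both operands are promoted and a second tiling
// level splits the tile across the numSubgroupY x numSubgroupX subgroups.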
static MatmulCodegenStrategy createTuringStrategy(int tileM, int tileN,
int tileK) {
std::array<int64_t, 3> nativeSize;
if (matType == "i8xi8xi32")
nativeSize = {16, 16, 32};
else if (matType == "f16xf16xf16")
nativeSize = {16, 16, 16};
else if (matType == "f16xf16xf32")
nativeSize = {16, 16, 16};
  else
    llvm_unreachable("Unsupported matrix type");
linalg::LinalgLoopDistributionOptions WGDistribute;
WGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
linalg::LinalgLoopDistributionOptions SGDistribute;
SGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
SGDistribute.procInfo = getSubgroupIds;
MatmulCodegenStrategy strategy;
strategy
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN, tileK})
.setDistributionOptions(WGDistribute))
.setHoistInvariantCode(enableLICM);
if (useWorkgroupMemory) {
strategy
.promote<linalg::MatmulOp>(
linalg::LinalgPromotionOptions()
.setAllocationDeallocationFns(
mlir::iree_compiler::allocateWorkgroupMemory,
mlir::iree_compiler::deallocateWorkgroupMemory)
.setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
mlir::iree_compiler::copyToWorkgroupMemory)
.setOperandsToPromote({0, 1})
.setUseFullTileBuffers({false, false}))
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes(
{tileM / numSubgroupY, tileN / numSubgroupX, tileK})
.setDistributionOptions(SGDistribute));
}
strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
nativeSize);
return strategy;
}
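// Builds, compiles, and runs one matmul configuration: emits a linalg.matmul
// kernel of the requested element types, applies the target-specific codegen
// strategy, executes the kernel through the Vulkan runner, and optionally
// verifies the result against a CPU reference computation.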
template <typename SrcT, typename DstT>
static void matMul(int m, int n, int k, int tileM, int tileN, int tileK,
bool correctness, int warpSize) {
const int resRows = m;
const int resColumns = n;
const int reductionSize = k;
StringLiteral funcName = "kernel_matmul";
ModelBuilder modelBuilder;
MLIRContext &ctx = *modelBuilder.getContext();
auto typeA = modelBuilder.getMemRefType({resRows, reductionSize},
SrcT::getMLIRType(ctx));
auto typeB = modelBuilder.getMemRefType({reductionSize, resColumns},
SrcT::getMLIRType(ctx));
auto typeC =
modelBuilder.getMemRefType({resRows, resColumns}, DstT::getMLIRType(ctx));
// 1. Build the kernel.
{
modelBuilder.addGPUAttr();
FuncOp kernelFunc = modelBuilder.makeFunction(
funcName, {}, {typeA, typeB, typeC}, MLIRFuncOpConfig());
int workgroupSize;
if (useWorkgroupMemory)
workgroupSize = warpSize * numSubgroupX * numSubgroupY;
else
workgroupSize = warpSize;
  // Without workgroup memory we map one workgroup to a single warp; with
  // workgroup memory the workgroup holds numSubgroupX * numSubgroupY warps.
kernelFunc.setAttr(
spirv::getEntryPointABIAttrName(),
spirv::getEntryPointABIAttr({workgroupSize, 1, 1}, &ctx));
OpBuilder b(&kernelFunc.getBody());
ScopedContext scope(b, kernelFunc.getLoc());
auto A = kernelFunc.getArgument(0);
auto B = kernelFunc.getArgument(1);
auto C = kernelFunc.getArgument(2);
linalg_matmul(ValueRange{A, B}, ValueRange{C});
std_ret();
}
  // 2. Compile the function, passing the runtime support library to the
  // execution engine for vector.print.
ModelRunner runner(modelBuilder.getModuleRef(),
ModelRunner::Target::GPUTarget);
CompilationOptions options;
options.loweringPasses = [&](mlir::PassManager &pm) {
MatmulCodegenStrategy strategy;
if (target == "powerVR") {
strategy = createPowerVRStrategy(tileM, tileN, tileK, warpSize);
} else if (target == "NVTuring") {
strategy = createTuringStrategy(tileM, tileN, tileK);
} else if (target == "mali") {
strategy = createMaliStrategy(tileM, tileN, tileK, warpSize);
}
modelBuilder.getModuleRef()->walk(
[&](FuncOp fn) { strategy.transform(fn); });
addLoweringPasses(pm, {resColumns / tileN, resRows / tileM, 1},
{typeA, typeB, typeC});
};
runner.compile(options, {vulkanWrapper});
// 3. Allocate data within data structures that interoperate with the MLIR ABI
// conventions used by codegen.
auto initA = [](unsigned idx, typename SrcT::Type *ptr) {
ptr[idx] = getMatA<typename SrcT::Type>(idx);
};
auto initB = [](unsigned idx, typename SrcT::Type *ptr) {
ptr[idx] = getMatB<typename SrcT::Type>(idx);
};
auto zeroInit = [](unsigned idx, typename DstT::Type *ptr) { ptr[idx] = 0; };
auto A = makeInitializedStridedMemRefDescriptor<typename SrcT::Type, 2>(
{resRows, reductionSize}, initA);
auto B = makeInitializedStridedMemRefDescriptor<typename SrcT::Type, 2>(
{reductionSize, resColumns}, initB);
auto C = makeInitializedStridedMemRefDescriptor<typename DstT::Type, 2>(
{resRows, resColumns}, zeroInit);
auto CPURes = makeInitializedStridedMemRefDescriptor<typename DstT::Type, 2>(
{resRows, resColumns}, zeroInit);
  // When checking correctness, compute the expected result on the CPU.
if (correctness) {
for (int i = 0; i < resRows; i++) {
for (int j = 0; j < resColumns; j++) {
typename DstT::Type acc = (*C)[i][j];
for (int k = 0; k < reductionSize; k++) {
typename DstT::Type a = (*A)[i][k];
typename DstT::Type b = (*B)[k][j];
acc += a * b;
}
(*CPURes)[i][j] = acc;
}
}
}
// 4. Call the funcOp named `funcName`.
auto err = runner.invoke(std::string(funcName) + "_wrapper", A, B, C);
if (err) llvm_unreachable("Error running function.");
if (correctness) {
bool correct = true;
for (int i = 0; i < resRows; i++) {
for (int j = 0; j < resColumns; j++) {
if (!EqualOrClose((*CPURes)[i][j], (*C)[i][j])) {
correct = false;
llvm::errs() << "mismatch at index(" << i << ", " << j
<< ") was expecting " << (*CPURes)[i][j] << " but got "
<< (*C)[i][j] << "\n";
}
}
}
if (correct) printf("pass\n");
}
}
static void matMul(int m, int n, int k, int tileM, int tileN, int tileK,
bool correctness, int warpSize) {
if (matType == "i8xi8xi32") {
return matMul<MatMulI8, MatMulI32>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
if (matType == "f16xf16xf16") {
return matMul<MatMulF16, MatMulF16>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
if (matType == "f16xf16xf32") {
return matMul<MatMulF16, MatMulF32>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
if (matType == "f32xf32xf32") {
return matMul<MatMulF32, MatMulF32>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
llvm_unreachable("Unsupported matrix type");
}
int main(int argc, char **argv) {
iree::Initializer::RunInitializers();
  // Allow LLVM setup through the command line and parse the test-specific
  // option for the runtime support library.
llvm::InitLLVM y(argc, argv);
llvm::cl::ParseCommandLineOptions(argc, argv, "BenchMatMulVectorGPU\n");
if (target.empty()) {
llvm::errs() << "No target specified.";
return 0;
}
int m = 4096;
int n = 4096;
int k = 4096;
if (correctness) {
m = 256;
n = 256;
k = 256;
}
int warpSize = 32;
std::pair<int, int> tileMRange;
std::pair<int, int> tileNRange;
std::pair<int, int> tileKRange;
if (target == "powerVR") {
m = std::max(m, 1024);
n = std::max(n, 1024);
k = std::max(k, 1024);
tileMRange = {32, 32};
tileNRange = {32, 32};
tileKRange = {4, 4};
} else if (target == "NVTuring") {
tileMRange = {32, 256};
tileNRange = {32, 256};
tileKRange = {32, 64};
    // Workgroup memory requires a tile size of at least 128x128 to do
    // full-speed copies from video memory to shared local memory.
if (useWorkgroupMemory) {
tileMRange.first = 128;
tileNRange.first = 128;
}
} else if (target == "mali") {
warpSize = 16;
tileMRange = {1, 8};
tileNRange = {64, 128};
tileKRange = {4, 4};
} else {
llvm::errs() << "Unknown target";
return 0;
}
printf("Matrix size: %ix%ix%i\n", m, n, k);
for (int tileK = tileKRange.first; tileK <= tileKRange.second; tileK *= 2) {
for (int tileM = tileMRange.first; tileM <= tileMRange.second; tileM *= 2) {
for (int tileN = tileNRange.first; tileN <= tileNRange.second;
tileN *= 2) {
printf("tileM=%i tileN=%i tileK=%i\n", tileM, tileN, tileK);
        // Round the matrix size up to a multiple of the tile size so that the
        // tiles divide the matrix evenly.
        // TODO(thomasraoux): enable non-power-of-two tiles once affine.min
        // folding is fixed.
auto paddedM = (m + tileM - 1) / tileM * tileM;
auto paddedN = (n + tileN - 1) / tileN * tileN;
auto paddedK = (k + tileK - 1) / tileK * tileK;
matMul(paddedM, paddedN, paddedK, tileM, tileN, tileK, correctness,
warpSize);
}
}
}
}