// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
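
// Standalone benchmark for Vulkan/SPIR-V matmul codegen. It builds a
// linalg.matmul kernel with ModelBuilder, applies a target-specific
// MatmulCodegenStrategy (PowerVR, Mali, or NVIDIA Turing), lowers the kernel
// to SPIR-V, runs it through the Vulkan wrapper library, and optionally
// checks the result against a CPU reference.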
#include <string>
#include "experimental/ModelBuilder/ModelBuilder.h"
#include "experimental/ModelBuilder/ModelRunner.h"
#include "experimental/ModelBuilder/VulkanWrapperPass.h"
#include "iree/base/initializer.h"
#include "iree/compiler/Conversion/CodegenUtils/MatmulCodegenStrategy.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/MemorySpace.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Passes.h"
#include "iree/compiler/Conversion/LinalgToSPIRV/Utils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/InitLLVM.h"
#include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"
#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/Linalg/EDSC/Intrinsics.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SPIRV/Passes.h"
#include "mlir/Dialect/SPIRV/SPIRVOps.h"
#include "mlir/Dialect/SPIRV/TargetAndABI.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/ExecutionEngine/CRunnerUtils.h"
#include "mlir/ExecutionEngine/RunnerUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/Parser.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
using namespace mlir; // NOLINT
using namespace mlir::edsc; // NOLINT
using namespace mlir::edsc::intrinsics; // NOLINT
static llvm::cl::opt<std::string> vulkanWrapper(
"vulkan-wrapper", llvm::cl::desc("Vulkan wrapper library"),
llvm::cl::value_desc("filename"), llvm::cl::init("-"));
static llvm::cl::opt<bool> correctness(
"correctness",
llvm::cl::desc(
"Compare the result to value calculated on CPU. We will use a smaller "
"matrix multiply in this case to avoid long runtime."),
llvm::cl::init(false));
static llvm::cl::opt<bool> useWorkgroupMemory(
"use-workgroup-memory", llvm::cl::desc("Enable use of workgroup memory"),
llvm::cl::value_desc("boolean"), llvm::cl::init(false));
static llvm::cl::opt<bool> enableLICM(
"enable-licm",
llvm::cl::desc("Enable loop invariant hoisting optimizations"),
llvm::cl::value_desc("boolean"), llvm::cl::init(true));
static llvm::cl::opt<std::string> matType("matrix-type",
llvm::cl::desc("Matrix element type"),
llvm::cl::value_desc("type"),
llvm::cl::init("i8xi8xi32"));
static llvm::cl::opt<std::string> target(
"target", llvm::cl::desc("Platform target to decide the strategy"),
llvm::cl::value_desc("type"), llvm::cl::init(""));
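
// Example invocation (the binary name and wrapper path below are
// illustrative, not part of this file):
//   bench-matmul-vector-gpu -target=NVTuring -matrix-type=f16xf16xf32 \
//     -vulkan-wrapper=/path/to/libvulkan-runtime-wrappers.so -correctness

// Lowering pipeline: convert the vectorized linalg kernel to GPU/SPIR-V,
// wrap it in a Vulkan launch, and lower the host-side code to LLVM.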
static void addLoweringPasses(mlir::PassManager &pm,
llvm::ArrayRef<int64_t> numWorkgroups,
llvm::ArrayRef<Type> args) {
pm.addPass(mlir::iree_compiler::createVectorToGPUPass());
pm.addPass(mlir::createLowerAffinePass());
pm.addPass(mlir::createLegalizeStdOpsForSPIRVLoweringPass());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addPass(mlir::iree_compiler::createVectorizeMemref());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addPass(mlir::iree_compiler::createConvertToSPIRVPass());
auto &spirvModulePM = pm.nest<mlir::spirv::ModuleOp>();
spirvModulePM.addPass(mlir::createSetSpirvABIPass());
spirvModulePM.addPass(mlir::spirv::createLowerABIAttributesPass());
spirvModulePM.addPass(mlir::createCanonicalizerPass());
spirvModulePM.addPass(mlir::createCSEPass());
spirvModulePM.addPass(
mlir::spirv::createUpdateVersionCapabilityExtensionPass());
pm.addPass(mlir::createAddVulkanLaunchWrapperPass(numWorkgroups, args));
mlir::LowerToLLVMOptions llvmOptions = {
/*useBarePtrCallConv=*/false,
/*emitCWrappers=*/true,
/*indexBitwidth=*/mlir::kDeriveIndexBitwidthFromDataLayout};
pm.addPass(createLowerToLLVMPass(llvmOptions));
pm.addPass(mlir::createConvertVulkanLaunchFuncToVulkanCallsPass());
}
static void insertBarrier(OpBuilder &b, Location loc) {
b.create<spirv::ControlBarrierOp>(loc, spirv::Scope::Workgroup,
spirv::Scope::Workgroup,
spirv::MemorySemantics::AcquireRelease);
}
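// Returns the processor-id / processor-count pairs used to distribute the two
// parallel loops of the matmul: loop 0 maps to dimension "y" and loop 1 to
// dimension "x" (e.g. gpu.block_id / gpu.grid_dim at the workgroup level).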
template <typename IdOp, typename NProcsOp>
static SmallVector<linalg::ProcInfo, 2> getGpuProcIds(
OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges) {
if (parallelLoopRanges.size() != 2)
llvm_unreachable("expected two parallel loops for matmul operation");
Type indexType = b.getIndexType();
SmallVector<linalg::ProcInfo, 2> procInfo(2);
procInfo[0] = {b.create<IdOp>(loc, indexType, b.getStringAttr("y")),
b.create<NProcsOp>(loc, indexType, b.getStringAttr("y"))};
procInfo[1] = {b.create<IdOp>(loc, indexType, b.getStringAttr("x")),
b.create<NProcsOp>(loc, indexType, b.getStringAttr("x"))};
return procInfo;
}
constexpr int numSubgroupX = 2;
constexpr int numSubgroupY = 2;
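// Splits the linear gpu.subgroup_id into a (y, x) pair over a
// numSubgroupY x numSubgroupX grid so that each subgroup owns a distinct piece
// of the workgroup-level tile.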
static SmallVector<linalg::ProcInfo, 2> getSubgroupIds(
OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges) {
if (parallelLoopRanges.size() != 2)
llvm_unreachable("expected two parallel loops for matmul operation");
Type indexType = b.getIndexType();
Value sg = b.create<gpu::SubgroupIdOp>(loc, indexType);
Value vSubgroupX = b.create<ConstantIndexOp>(loc, numSubgroupX);
Value sgdiv = b.create<SignedDivIOp>(loc, indexType, sg, vSubgroupX);
Value vSubgroupY = b.create<ConstantIndexOp>(loc, numSubgroupY);
SmallVector<linalg::ProcInfo, 2> procInfo(2);
using namespace edsc::op;
procInfo[0] = {sgdiv % vSubgroupY, vSubgroupY};
procInfo[1] = {sg % vSubgroupX, vSubgroupX};
return procInfo;
}
struct MatMulF32 {
using Type = float;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return FloatType::getF32(&ctx);
}
};
struct MatMulI8 {
using Type = uint8_t;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return IntegerType::get(8, &ctx);
}
};
struct MatMulI32 {
using Type = uint32_t;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return IntegerType::get(32, &ctx);
}
};
// Class to emulate half float on CPU.
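// The float<->half conversions below truncate the mantissa (no rounding) and
// do not handle Inf/NaN or subnormals specially, which is good enough for the
// small test values used here; they also rely on pointer type punning for the
// bit-level reinterpretation. Worked example: 1.0f (0x3F800000) has a biased
// exponent of 127, which rebiases to 15 and packs to the half-precision bit
// pattern 0x3C00.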
class fp16 {
public:
void fromFloat(const float &x) {
uint32_t asInt = *(uint32_t *)&x;
int sign = (asInt & 0x80000000) >> 31;
int exp = ((asInt & 0x7f800000) >> 23) - 127 + 15;
int mantissa = (asInt & 0x7FFFFF);
if (exp > 31) exp = 31;
if (exp < 0) exp = 0;
sign = sign << 15;
exp = exp << 10;
mantissa = mantissa >> (23 - 10);
asInt = sign | exp | mantissa;
value = asInt;
}
fp16(const float &x) { fromFloat(x); }
fp16 &operator=(const float &x) {
fromFloat(x);
return *this;
}
fp16 &operator=(const int &x) {
fromFloat((float)x);
return *this;
}
fp16 &operator+=(const fp16 &x) {
fromFloat(toFloat() + x.toFloat());
return *this;
}
float toFloat() const {
uint32_t asInt = value;
int sign = (asInt & 0x8000) >> 15;
int exp = ((asInt & 0x7c00) >> 10);
int mantissa = (asInt & 0x3FF);
sign = sign << 31;
if (exp > 0) {
exp = (exp + 127 - 15) << 23;
mantissa = mantissa << (23 - 10);
} else {
mantissa = 0;
}
asInt = sign | exp | mantissa;
return *(float *)&asInt;
}
operator float() { return toFloat(); }
private:
uint16_t value;
};
struct MatMulF16 {
using Type = fp16;
static mlir::Type getMLIRType(MLIRContext &ctx) {
return FloatType::getF16(&ctx);
}
};
/// Functions to initialize the matrices based on the element type.
template <typename T>
static T getMatA(unsigned idx) {
if (std::is_same<T, float>::value || std::is_same<T, fp16>::value)
return ((float)(idx % 5) - 1.0f) / 2.0f;
else
return (3 * idx + 1) % 117;
}
template <typename T>
static T getMatB(unsigned idx) {
if (std::is_same<T, float>::value || std::is_same<T, fp16>::value)
return ((float)(idx % 7) - 1.0f) / 2.0f;
else
return idx % 127;
}
template <typename T>
static bool EqualOrClose(T a, T b) {
if (std::is_same<T, float>::value || std::is_same<T, fp16>::value)
return fabs((float)a - (float)b) < 0.001f;
return a == b;
}
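// PowerVR strategy: tile the matmul at the workgroup level ({tileM, tileN,
// tileK}, distributed over gpu.block_id), optionally promote matrix B to
// workgroup memory, then tile again per invocation ({1, tileN, tileK},
// distributing the rows over gpu.thread_id x) and unroll vector contractions
// to scalar (1x1x1) native ops.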
static MatmulCodegenStrategy createPowerVRStrategy(int tileM, int tileN,
int tileK, int warpSize) {
const std::array<int64_t, 3> nativeSize = {1, 1, 1};
linalg::LinalgLoopDistributionOptions WIDistribute;
linalg::LinalgLoopDistributionOptions WGDistribute;
WGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
WIDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WIDistribute.procInfo = [warpSize](OpBuilder &b, Location loc,
ArrayRef<Range> parallelLoopRanges) {
Type indexType = b.getIndexType();
SmallVector<linalg::ProcInfo, 2> procInfo(2);
procInfo[0] = {
b.create<gpu::ThreadIdOp>(loc, indexType, b.getStringAttr("x")),
b.create<ConstantIndexOp>(loc, warpSize)};
procInfo[1] = {b.create<ConstantIndexOp>(loc, 0),
b.create<ConstantIndexOp>(loc, 1)};
return procInfo;
};
MatmulCodegenStrategy strategy;
SmallVector<int64_t, 2> promotionList;
  // Promote matrix B (operand index 1) to workgroup memory.
promotionList.push_back(1);
strategy
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN, tileK})
.setDistributionOptions(WGDistribute))
.setHoistInvariantCode(enableLICM);
if (useWorkgroupMemory) {
strategy.promote<linalg::MatmulOp>(
linalg::LinalgPromotionOptions()
.setAllocationDeallocationFns(
mlir::iree_compiler::allocateWorkgroupMemory,
mlir::iree_compiler::deallocateWorkgroupMemory)
.setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
mlir::iree_compiler::copyToWorkgroupMemory)
.setOperandsToPromote(promotionList)
.setUseFullTileBuffers({false, false}));
}
strategy.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({1, tileN, tileK})
.setDistributionOptions(WIDistribute));
strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
nativeSize);
return strategy;
}
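// Mali strategy: tile at the workgroup level as above, then split the N
// dimension across the warp ({tileM, tileN / warpSize, tileK} per invocation,
// distributed over gpu.thread_id x) and unroll vector contractions to a
// 1x4x1 native size.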
static MatmulCodegenStrategy createMaliStrategy(int tileM, int tileN, int tileK,
int warpSize) {
const std::array<int64_t, 3> nativeSize = {1, 4, 1};
linalg::LinalgLoopDistributionOptions WIDistribute;
linalg::LinalgLoopDistributionOptions WGDistribute;
WGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
WIDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WIDistribute.procInfo = [warpSize](OpBuilder &b, Location loc,
ArrayRef<Range> parallelLoopRanges) {
Type indexType = b.getIndexType();
SmallVector<linalg::ProcInfo, 2> procInfo(2);
procInfo[1] = {
b.create<gpu::ThreadIdOp>(loc, indexType, b.getStringAttr("x")),
b.create<ConstantIndexOp>(loc, warpSize)};
procInfo[0] = {b.create<ConstantIndexOp>(loc, 0),
b.create<ConstantIndexOp>(loc, 1)};
return procInfo;
};
MatmulCodegenStrategy strategy;
strategy
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN, tileK})
.setDistributionOptions(WGDistribute))
.setHoistInvariantCode(enableLICM);
strategy.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN / warpSize, tileK})
.setDistributionOptions(WIDistribute));
strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
nativeSize);
return strategy;
}
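// Turing strategy: the native contraction sizes are chosen to match the
// tensor core / cooperative matrix shapes (16x16x32 for i8 inputs, 16x16x16
// for f16). Workgroup-level tiles are distributed over gpu.block_id; when
// workgroup memory is enabled, both operands are promoted and a second tiling
// level splits the tile across the numSubgroupY x numSubgroupX subgroups.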
static MatmulCodegenStrategy createTuringStrategy(int tileM, int tileN,
int tileK) {
std::array<int64_t, 3> nativeSize;
if (matType == "i8xi8xi32")
nativeSize = {16, 16, 32};
else if (matType == "f16xf16xf16")
nativeSize = {16, 16, 16};
else if (matType == "f16xf16xf32")
nativeSize = {16, 16, 16};
  else
    llvm_unreachable("Unsupported matrix type");
linalg::LinalgLoopDistributionOptions WGDistribute;
WGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
WGDistribute.procInfo = getGpuProcIds<gpu::BlockIdOp, gpu::GridDimOp>;
linalg::LinalgLoopDistributionOptions SGDistribute;
SGDistribute.distributionMethod = {
linalg::DistributionMethod::CyclicNumProcsEqNumIters,
linalg::DistributionMethod::CyclicNumProcsEqNumIters};
SGDistribute.procInfo = getSubgroupIds;
MatmulCodegenStrategy strategy;
strategy
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes({tileM, tileN, tileK})
.setDistributionOptions(WGDistribute))
.setHoistInvariantCode(enableLICM);
if (useWorkgroupMemory) {
strategy
.promote<linalg::MatmulOp>(
linalg::LinalgPromotionOptions()
.setAllocationDeallocationFns(
mlir::iree_compiler::allocateWorkgroupMemory,
mlir::iree_compiler::deallocateWorkgroupMemory)
.setCopyInOutFns(mlir::iree_compiler::copyToWorkgroupMemory,
mlir::iree_compiler::copyToWorkgroupMemory)
.setOperandsToPromote({0, 1})
.setUseFullTileBuffers({false, false}))
.tile<linalg::MatmulOp>(
linalg::LinalgTilingOptions()
.setLoopType(linalg::LinalgTilingLoopType::ParallelLoops)
.setTileSizes(
{tileM / numSubgroupY, tileN / numSubgroupX, tileK})
.setDistributionOptions(SGDistribute));
}
strategy.vectorize<linalg::MatmulOp>().unrollVector<vector::ContractionOp>(
nativeSize);
return strategy;
}
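// Builds, compiles, and runs one matmul configuration: emits a linalg.matmul
// kernel of the requested element types, applies the target-specific codegen
// strategy, executes the kernel through the Vulkan runner, and optionally
// verifies the result against a CPU reference computation.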
template <typename SrcT, typename DstT>
static void matMul(int m, int n, int k, int tileM, int tileN, int tileK,
bool correctness, int warpSize) {
const int resRows = m;
const int resColumns = n;
const int reductionSize = k;
StringLiteral funcName = "kernel_matmul";
ModelBuilder modelBuilder;
MLIRContext &ctx = *modelBuilder.getContext();
auto typeA = modelBuilder.getMemRefType({resRows, reductionSize},
SrcT::getMLIRType(ctx));
auto typeB = modelBuilder.getMemRefType({reductionSize, resColumns},
SrcT::getMLIRType(ctx));
auto typeC =
modelBuilder.getMemRefType({resRows, resColumns}, DstT::getMLIRType(ctx));
// 1. Build the kernel.
{
modelBuilder.addGPUAttr();
FuncOp kernelFunc = modelBuilder.makeFunction(
funcName, {}, {typeA, typeB, typeC}, MLIRFuncOpConfig());
int workgroupSize;
if (useWorkgroupMemory)
workgroupSize = warpSize * numSubgroupX * numSubgroupY;
else
workgroupSize = warpSize;
  // Without workgroup memory we map one workgroup to a single warp; with
  // workgroup memory the workgroup holds numSubgroupX * numSubgroupY warps.
kernelFunc.setAttr(
spirv::getEntryPointABIAttrName(),
spirv::getEntryPointABIAttr({workgroupSize, 1, 1}, &ctx));
OpBuilder b(&kernelFunc.getBody());
ScopedContext scope(b, kernelFunc.getLoc());
auto A = kernelFunc.getArgument(0);
auto B = kernelFunc.getArgument(1);
auto C = kernelFunc.getArgument(2);
linalg_matmul(ValueRange{A, B}, ValueRange{C});
std_ret();
}
  // 2. Compile the function, passing the runtime support library to the
  // execution engine for vector.print.
ModelRunner runner(modelBuilder.getModuleRef(),
ModelRunner::Target::GPUTarget);
CompilationOptions options;
options.loweringPasses = [&](mlir::PassManager &pm) {
MatmulCodegenStrategy strategy;
if (target == "powerVR") {
strategy = createPowerVRStrategy(tileM, tileN, tileK, warpSize);
} else if (target == "NVTuring") {
strategy = createTuringStrategy(tileM, tileN, tileK);
} else if (target == "mali") {
strategy = createMaliStrategy(tileM, tileN, tileK, warpSize);
}
modelBuilder.getModuleRef()->walk(
[&](FuncOp fn) { strategy.transform(fn); });
addLoweringPasses(pm, {resColumns / tileN, resRows / tileM, 1},
{typeA, typeB, typeC});
};
runner.compile(options, {vulkanWrapper});
// 3. Allocate data within data structures that interoperate with the MLIR ABI
// conventions used by codegen.
auto initA = [](unsigned idx, typename SrcT::Type *ptr) {
ptr[idx] = getMatA<typename SrcT::Type>(idx);
};
auto initB = [](unsigned idx, typename SrcT::Type *ptr) {
ptr[idx] = getMatB<typename SrcT::Type>(idx);
};
auto zeroInit = [](unsigned idx, typename DstT::Type *ptr) { ptr[idx] = 0; };
auto A = makeInitializedStridedMemRefDescriptor<typename SrcT::Type, 2>(
{resRows, reductionSize}, initA);
auto B = makeInitializedStridedMemRefDescriptor<typename SrcT::Type, 2>(
{reductionSize, resColumns}, initB);
auto C = makeInitializedStridedMemRefDescriptor<typename DstT::Type, 2>(
{resRows, resColumns}, zeroInit);
auto CPURes = makeInitializedStridedMemRefDescriptor<typename DstT::Type, 2>(
{resRows, resColumns}, zeroInit);
  // When checking correctness, compute the expected result on the CPU.
if (correctness) {
for (int i = 0; i < resRows; i++) {
for (int j = 0; j < resColumns; j++) {
typename DstT::Type acc = (*C)[i][j];
for (int k = 0; k < reductionSize; k++) {
typename DstT::Type a = (*A)[i][k];
typename DstT::Type b = (*B)[k][j];
acc += a * b;
}
(*CPURes)[i][j] = acc;
}
}
}
// 4. Call the funcOp named `funcName`.
auto err = runner.invoke(std::string(funcName) + "_wrapper", A, B, C);
if (err) llvm_unreachable("Error running function.");
if (correctness) {
bool correct = true;
for (int i = 0; i < resRows; i++) {
for (int j = 0; j < resColumns; j++) {
if (!EqualOrClose((*CPURes)[i][j], (*C)[i][j])) {
correct = false;
llvm::errs() << "mismatch at index(" << i << ", " << j
<< ") was expecting " << (*CPURes)[i][j] << " but got "
<< (*C)[i][j] << "\n";
}
}
}
if (correct) printf("pass\n");
}
}
static void matMul(int m, int n, int k, int tileM, int tileN, int tileK,
bool correctness, int warpSize) {
if (matType == "i8xi8xi32") {
return matMul<MatMulI8, MatMulI32>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
if (matType == "f16xf16xf16") {
return matMul<MatMulF16, MatMulF16>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
if (matType == "f16xf16xf32") {
return matMul<MatMulF16, MatMulF32>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
if (matType == "f32xf32xf32") {
return matMul<MatMulF32, MatMulF32>(m, n, k, tileM, tileN, tileK,
correctness, warpSize);
}
llvm_unreachable("Unsupported matrix type");
}
int main(int argc, char **argv) {
iree::Initializer::RunInitializers();
  // Allow LLVM setup through the command line and parse the test-specific
  // option for the runtime support library.
llvm::InitLLVM y(argc, argv);
llvm::cl::ParseCommandLineOptions(argc, argv, "BenchMatMulVectorGPU\n");
if (target.empty()) {
llvm::errs() << "No target specified.";
return 0;
}
int m = 4096;
int n = 4096;
int k = 4096;
if (correctness) {
m = 256;
n = 256;
k = 256;
}
int warpSize = 32;
std::pair<int, int> tileMRange;
std::pair<int, int> tileNRange;
std::pair<int, int> tileKRange;
if (target == "powerVR") {
m = std::max(m, 1024);
n = std::max(n, 1024);
k = std::max(k, 1024);
tileMRange = {32, 32};
tileNRange = {32, 32};
tileKRange = {4, 4};
} else if (target == "NVTuring") {
tileMRange = {32, 256};
tileNRange = {32, 256};
tileKRange = {32, 64};
    // Workgroup memory requires a tile size of at least 128x128 to do
    // full-speed copies from video memory to shared local memory.
if (useWorkgroupMemory) {
tileMRange.first = 128;
tileNRange.first = 128;
}
} else if (target == "mali") {
warpSize = 16;
tileMRange = {1, 8};
tileNRange = {64, 128};
tileKRange = {4, 4};
} else {
llvm::errs() << "Unknown target";
return 0;
}
printf("Matrix size: %ix%ix%i\n", m, n, k);
for (int tileK = tileKRange.first; tileK <= tileKRange.second; tileK *= 2) {
for (int tileM = tileMRange.first; tileM <= tileMRange.second; tileM *= 2) {
for (int tileN = tileNRange.first; tileN <= tileNRange.second;
tileN *= 2) {
printf("tileM=%i tileN=%i tileK=%i\n", tileM, tileN, tileK);
        // Round the matrix size up to a multiple of the tile size so that the
        // tiles divide the matrix evenly.
        // TODO(thomasraoux): enable non-power-of-two tiles once affine.min
        // folding is fixed.
auto paddedM = (m + tileM - 1) / tileM * tileM;
auto paddedN = (n + tileN - 1) / tileN * tileN;
auto paddedK = (k + tileK - 1) / tileK * tileK;
matMul(paddedM, paddedN, paddedK, tileM, tileN, tileK, correctness,
warpSize);
}
}
}
}