|  | // Copyright 2020 Google LLC | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | //      https://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | #include "benchmark/benchmark.h" | 
|  | #include "experimental/ModelBuilder/MemRefUtils.h" | 
|  | #include "experimental/ModelBuilder/ModelBuilder.h" | 
|  | #include "experimental/ModelBuilder/ModelRunner.h" | 
|  |  | 
|  | using namespace mlir;  // NOLINT | 
|  |  | 
|  | // Helper method to construct an affine map. | 
|  | static SmallVector<AffineMap, 3> makeColumnMajorMatmulMaps(ModelBuilder &mb) { | 
|  | AffineExpr m, n, k; | 
|  | bindDims(mb.getContext(), m, n, k); | 
|  | SmallVector<AffineMap, 3> results; | 
|  | results.push_back(AffineMap::get(3, 0, {k, n}, mb.getContext())); | 
|  | results.push_back(AffineMap::get(3, 0, {m, k}, mb.getContext())); | 
|  | results.push_back(AffineMap::get(3, 0, {n, m}, mb.getContext())); | 
|  | return results; | 
|  | } | 
|  |  | 
|  | // Helper method to build a matrix-matrix column-major multiplication function | 
|  | // using the vector dialect and that runs ITERS times to amortize any calling | 
|  | // overhead. | 
|  | template <unsigned M, unsigned N, unsigned K, unsigned ITERS> | 
|  | void buildMatMat(ModelBuilder &mb, StringLiteral fn) { | 
|  | auto f32 = mb.f32; | 
|  | auto mkVectorType = mb.getVectorType({M, K}, f32); | 
|  | auto typeA = mb.getMemRefType({}, mkVectorType); | 
|  | auto knVectorType = mb.getVectorType({K, N}, f32); | 
|  | auto typeB = mb.getMemRefType({}, knVectorType); | 
|  | auto mnVectorType = mb.getVectorType({M, N}, f32); | 
|  | auto typeC = mb.getMemRefType({}, mnVectorType); | 
|  |  | 
|  | auto f = mb.makeFunction( | 
|  | fn, {}, {typeA, typeB, typeC}, | 
|  | MLIRFuncOpConfig().setEmitCInterface(true).setPreferAvx512(true)); | 
|  | OpBuilder b(&f.getBody()); | 
|  | ScopedContext scope(b, f.getLoc()); | 
|  |  | 
|  | // Build the following accesses: | 
|  | //   affine_map<(m, n, k) -> (k, m)>, | 
|  | //   affine_map<(m, n, k) -> (n, k)>, | 
|  | //   affine_map<(m, n, k) -> (n, m)> | 
|  | SmallVector<AffineMap, 4> accesses = makeColumnMajorMatmulMaps(mb); | 
|  |  | 
|  | // Build the following iterator types: | 
|  | //   iterator_types = ["parallel", "parallel", "reduction"] | 
|  | SmallVector<Attribute, 4> iterator_types; | 
|  | iterator_types.push_back(mb.getStringAttr("parallel")); | 
|  | iterator_types.push_back(mb.getStringAttr("parallel")); | 
|  | iterator_types.push_back(mb.getStringAttr("reduction")); | 
|  |  | 
|  | // Loop ITERS times over the kernel to reduce the JIT's overhead. | 
|  | StdIndexedValue A(f.getArgument(0)), B(f.getArgument(1)), C(f.getArgument(2)); | 
|  | loopNestBuilder(std_constant_index(0), std_constant_index(ITERS), | 
|  | std_constant_index(1), [&](Value) { | 
|  | // Compute C += A x B, in column-major form, with LLVM | 
|  | // matrix intrinsics. | 
|  | C() = (vector_contract(A(), B(), C(), | 
|  | mb.getAffineMapArrayAttr(accesses), | 
|  | mb.getArrayAttr(iterator_types))); | 
|  | }); | 
|  | std_ret(); | 
|  | } | 
|  |  | 
|  | // Benchmark method. | 
|  | template <unsigned M, unsigned N, unsigned K, bool MeasureBuild, | 
|  | bool LowerToLLVMMatrixIntrinsics> | 
|  | void BM_MxMColMajorVectors(benchmark::State &state) { | 
|  | constexpr unsigned NumMxMPerIteration = 1000; | 
|  | state.counters["NumMxM/Iter"] = NumMxMPerIteration; | 
|  | // Column major vector types. | 
|  | using TypeLHS = Vector2D<K, M, float>; | 
|  | using TypeRHS = Vector2D<N, K, float>; | 
|  | using TypeRES = Vector2D<N, M, float>; | 
|  | // Prepare arguments beforehand. | 
|  | auto oneInit = [](unsigned idx, TypeLHS *ptr) { | 
|  | float *p = reinterpret_cast<float *>(ptr + idx); | 
|  | for (unsigned i = 0; i < M * N; ++i) p[i] = 1.0f; | 
|  | }; | 
|  | auto incInit = [](unsigned idx, TypeRHS *ptr) { | 
|  | float *p = reinterpret_cast<float *>(ptr + idx); | 
|  | for (unsigned i = 0; i < M * N; ++i) p[i] = 1.0f + i; | 
|  | }; | 
|  | auto zeroInit = [](unsigned idx, TypeRES *ptr) { | 
|  | float *p = reinterpret_cast<float *>(ptr + idx); | 
|  | for (unsigned i = 0; i < M * N; ++i) p[i] = 0.0f; | 
|  | }; | 
|  | auto A = makeInitializedStridedMemRefDescriptor<TypeLHS, 1>({1}, oneInit); | 
|  | auto B = makeInitializedStridedMemRefDescriptor<TypeRHS, 1>({1}, incInit); | 
|  | auto C = makeInitializedStridedMemRefDescriptor<TypeRES, 1>({1}, zeroInit); | 
|  | StringLiteral funcName = "matmult_column_major"; | 
|  |  | 
|  | vector::VectorTransformsOptions vectorTransformsOptions{ | 
|  | LowerToLLVMMatrixIntrinsics ? vector::VectorContractLowering::Matmul | 
|  | : vector::VectorContractLowering::Dot}; | 
|  | CompilationOptions compilationOptions{/*llvmOptLevel=*/3, /*llcOptLevel=*/3, | 
|  | vectorTransformsOptions}; | 
|  | if (MeasureBuild) { | 
|  | // If this is a build-time benchmark, build, compile, and execute | 
|  | // the function inside the timed loop, building a fresh new function | 
|  | // in each iteration to get the full JIT time (keep I == 1 here). | 
|  | for (auto _ : state) { | 
|  | ModelBuilder builder; | 
|  | buildMatMat<M, N, K, 1>(builder, funcName); | 
|  | ModelRunner runner(builder.getModuleRef()); | 
|  | runner.compile(compilationOptions); | 
|  | auto err = runner.invoke(funcName, A, B, C); | 
|  | if (err) llvm_unreachable("Error compiling/running function."); | 
|  | } | 
|  | } else { | 
|  | // If this is a run-time benchmark, build, compile, and execute | 
|  | // the function once outside the timed loop, then continue running | 
|  | // the same function inside the loop to focus on actual runtime | 
|  | // (set I == NumIterations here to amortize calling overhead). | 
|  | ModelBuilder builder; | 
|  | buildMatMat<M, N, K, NumMxMPerIteration>(builder, funcName); | 
|  | ModelRunner runner(builder.getModuleRef()); | 
|  | runner.compile(compilationOptions); | 
|  | auto err = runner.invoke(funcName, A, B, C); | 
|  | if (err) llvm_unreachable("Error compiling/running function."); | 
|  | for (auto _ : state) { | 
|  | auto err_run = runner.invoke(funcName, A, B, C); | 
|  | if (err_run) llvm_unreachable("Error running function."); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | int main(int argc, char **argv) { | 
|  | ::benchmark::Initialize(&argc, argv); | 
|  | if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; | 
|  | ::benchmark::RunSpecifiedBenchmarks(); | 
|  | } | 
|  |  | 
|  | // | 
|  | // Benchmark drivers (build). | 
|  | // | 
|  |  | 
|  | #define BENCHMARK_MATMUL_COLUMN_MAJOR(SZ_M, SZ_N, SZ_K)                      \ | 
|  | BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, true, false);  \ | 
|  | BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, true, true);   \ | 
|  | BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, false, false); \ | 
|  | BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, false, true); | 
|  |  | 
|  | BENCHMARK_MATMUL_COLUMN_MAJOR(1, 1, 1); | 
|  | BENCHMARK_MATMUL_COLUMN_MAJOR(2, 2, 2); | 
|  | BENCHMARK_MATMUL_COLUMN_MAJOR(4, 4, 4); | 
|  | BENCHMARK_MATMUL_COLUMN_MAJOR(8, 8, 8); | 
|  | BENCHMARK_MATMUL_COLUMN_MAJOR(16, 16, 16); |