experimental/ModelBuilder/test/BenchMatMulVectorColumnMajorLLVMIntrinsicsJIT.cpp - 3p/openxla/iree - Git at Google

 // Copyright 2020 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "benchmark/benchmark.h"
 #include "experimental/ModelBuilder/MemRefUtils.h"
 #include "experimental/ModelBuilder/ModelBuilder.h"
 #include "experimental/ModelBuilder/ModelRunner.h"

 using namespace mlir;  // NOLINT

 // Returns the three indexing maps handed to the vector contraction, expressed
 // over the iteration dimensions (m, n, k). In order they are:
 //   affine_map<(m, n, k) -> (k, n)>,
 //   affine_map<(m, n, k) -> (m, k)>,
 //   affine_map<(m, n, k) -> (n, m)>
 static SmallVector<AffineMap, 3> makeColumnMajorMatmulMaps(ModelBuilder &mb) {
   MLIRContext *ctx = mb.getContext();
   AffineExpr m, n, k;
   bindDims(ctx, m, n, k);
   // Every map has 3 dimensions and 0 symbols.
   return {AffineMap::get(3, 0, {k, n}, ctx),
           AffineMap::get(3, 0, {m, k}, ctx),
           AffineMap::get(3, 0, {n, m}, ctx)};
 }

 // Emits into `mb` a function named `fn` that performs a matrix-matrix
 // contraction on whole 2-D vectors using the vector dialect. The contraction
 // is wrapped in a loop of ITERS trips so that a single call amortizes the
 // calling overhead.
 //
 // The function takes three arguments, each a memref (built with an empty
 // shape list) whose element type is a 2-D f32 vector: MxK, KxN and MxN.
 template <unsigned M, unsigned N, unsigned K, unsigned ITERS>
 void buildMatMat(ModelBuilder &mb, StringLiteral fn) {
   auto elemTy = mb.f32;
   auto lhsTy = mb.getMemRefType({}, mb.getVectorType({M, K}, elemTy));
   auto rhsTy = mb.getMemRefType({}, mb.getVectorType({K, N}, elemTy));
   auto accTy = mb.getMemRefType({}, mb.getVectorType({M, N}, elemTy));

   auto func = mb.makeFunction(
       fn, {}, {lhsTy, rhsTy, accTy},
       MLIRFuncOpConfig().setEmitCInterface(true).setPreferAvx512(true));
   OpBuilder builder(&func.getBody());
   ScopedContext scope(builder, func.getLoc());

   // Indexing maps, as produced by makeColumnMajorMatmulMaps:
   //   affine_map<(m, n, k) -> (k, n)>,
   //   affine_map<(m, n, k) -> (m, k)>,
   //   affine_map<(m, n, k) -> (n, m)>
   auto indexingMaps = mb.getAffineMapArrayAttr(makeColumnMajorMatmulMaps(mb));

   // iterator_types = ["parallel", "parallel", "reduction"]
   SmallVector<Attribute, 4> iteratorKinds;
   for (const char *kind : {"parallel", "parallel", "reduction"})
     iteratorKinds.push_back(mb.getStringAttr(kind));
   auto iteratorTypes = mb.getArrayAttr(iteratorKinds);

   // Repeat the kernel ITERS times inside the generated function.
   StdIndexedValue lhs(func.getArgument(0));
   StdIndexedValue rhs(func.getArgument(1));
   StdIndexedValue acc(func.getArgument(2));
   loopNestBuilder(std_constant_index(0), std_constant_index(ITERS),
                   std_constant_index(1), [&](Value) {
                     // acc += lhs x rhs, in column-major form; the result of
                     // the contraction is stored back into the accumulator.
                     acc() = (vector_contract(lhs(), rhs(), acc(),
                                              indexingMaps, iteratorTypes));
                   });
   std_ret();
 }

 // Benchmark body for the column-major vector matmul.
 //
 // Template parameters:
 //   M, N, K                      - problem sizes of the contraction.
 //   MeasureBuild                 - when true, building + JIT-compiling + one
 //                                  invocation are all inside the timed loop;
 //                                  when false, only invocations are timed.
 //   LowerToLLVMMatrixIntrinsics  - selects VectorContractLowering::Matmul
 //                                  (true) or VectorContractLowering::Dot.
 template <unsigned M, unsigned N, unsigned K, bool MeasureBuild,
           bool LowerToLLVMMatrixIntrinsics>
 void BM_MxMColMajorVectors(benchmark::State &state) {
   constexpr unsigned NumMxMPerIteration = 1000;
   // NOTE(review): this counter is reported for both modes, although the
   // build-time mode below only performs one multiplication per call.
   state.counters["NumMxM/Iter"] = NumMxMPerIteration;
   // Column major vector types.
   using TypeLHS = Vector2D<K, M, float>;
   using TypeRHS = Vector2D<N, K, float>;
   using TypeRES = Vector2D<N, M, float>;
   // Prepare arguments beforehand. Each initializer writes exactly as many
   // floats as its own operand type declares (K*M, N*K and N*M respectively)
   // so that non-square instantiations neither overrun nor under-fill a buffer.
   // Assumes the vector types store their floats densely packed — TODO confirm
   // for non-power-of-two dimensions.
   auto oneInit = [](unsigned idx, TypeLHS *ptr) {
     float *p = reinterpret_cast<float *>(ptr + idx);
     for (unsigned i = 0; i < K * M; ++i) p[i] = 1.0f;
   };
   auto incInit = [](unsigned idx, TypeRHS *ptr) {
     float *p = reinterpret_cast<float *>(ptr + idx);
     for (unsigned i = 0; i < N * K; ++i) p[i] = 1.0f + i;
   };
   auto zeroInit = [](unsigned idx, TypeRES *ptr) {
     float *p = reinterpret_cast<float *>(ptr + idx);
     for (unsigned i = 0; i < N * M; ++i) p[i] = 0.0f;
   };
   auto A = makeInitializedStridedMemRefDescriptor<TypeLHS, 1>({1}, oneInit);
   auto B = makeInitializedStridedMemRefDescriptor<TypeRHS, 1>({1}, incInit);
   auto C = makeInitializedStridedMemRefDescriptor<TypeRES, 1>({1}, zeroInit);
   StringLiteral funcName = "matmult_column_major";

   vector::VectorTransformsOptions vectorTransformsOptions{
       LowerToLLVMMatrixIntrinsics ? vector::VectorContractLowering::Matmul
                                   : vector::VectorContractLowering::Dot};
   CompilationOptions compilationOptions{/*llvmOptLevel=*/3, /*llcOptLevel=*/3,
                                         vectorTransformsOptions};
   if (MeasureBuild) {
     // If this is a build-time benchmark, build, compile, and execute
     // the function inside the timed loop, building a fresh new function
     // in each iteration to get the full JIT time (keep ITERS == 1 here).
     for (auto _ : state) {
       ModelBuilder builder;
       buildMatMat<M, N, K, 1>(builder, funcName);
       ModelRunner runner(builder.getModuleRef());
       runner.compile(compilationOptions);
       auto err = runner.invoke(funcName, A, B, C);
       if (err) llvm_unreachable("Error compiling/running function.");
     }
   } else {
     // If this is a run-time benchmark, build, compile, and execute
     // the function once outside the timed loop, then continue running
     // the same function inside the loop to focus on actual runtime
     // (set ITERS == NumMxMPerIteration here to amortize calling overhead).
     ModelBuilder builder;
     buildMatMat<M, N, K, NumMxMPerIteration>(builder, funcName);
     ModelRunner runner(builder.getModuleRef());
     runner.compile(compilationOptions);
     auto err = runner.invoke(funcName, A, B, C);
     if (err) llvm_unreachable("Error compiling/running function.");
     for (auto _ : state) {
       auto err_run = runner.invoke(funcName, A, B, C);
       if (err_run) llvm_unreachable("Error running function.");
     }
   }
 }

 // Program entry point: hands the command line to the benchmark library,
 // exits with status 1 if it reports unrecognized arguments, and otherwise
 // runs the benchmarks selected on the command line.
 int main(int argc, char **argv) {
   ::benchmark::Initialize(&argc, argv);
   const bool hasUnrecognizedArgs =
       ::benchmark::ReportUnrecognizedArguments(argc, argv);
   if (hasUnrecognizedArgs) {
     return 1;
   }
   ::benchmark::RunSpecifiedBenchmarks();
   return 0;
 }

 //
 // Benchmark drivers (build-time and run-time).
 //

 // Registers the four variants of BM_MxMColMajorVectors for one problem size.
 // The last two template arguments are <MeasureBuild,
 // LowerToLLVMMatrixIntrinsics>, so the variants are, in registration order:
 //   build-time / Dot lowering, build-time / Matmul lowering,
 //   run-time   / Dot lowering, run-time   / Matmul lowering.
 #define BENCHMARK_MATMUL_COLUMN_MAJOR(SZ_M, SZ_N, SZ_K)                      \
   BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, true, false);  \
   BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, true, true);   \
   BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, false, false); \
   BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, false, true);

 // Square problem sizes only (M == N == K), from 1 up to 16.
 BENCHMARK_MATMUL_COLUMN_MAJOR(1, 1, 1);
 BENCHMARK_MATMUL_COLUMN_MAJOR(2, 2, 2);
 BENCHMARK_MATMUL_COLUMN_MAJOR(4, 4, 4);
 BENCHMARK_MATMUL_COLUMN_MAJOR(8, 8, 8);
 BENCHMARK_MATMUL_COLUMN_MAJOR(16, 16, 16);
	// Copyright 2020 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "benchmark/benchmark.h"
	#include "experimental/ModelBuilder/MemRefUtils.h"
	#include "experimental/ModelBuilder/ModelBuilder.h"
	#include "experimental/ModelBuilder/ModelRunner.h"

	using namespace mlir; // NOLINT

	// Returns the three indexing maps handed to the vector contraction, expressed
	// over the iteration dimensions (m, n, k). In order they are:
	//   affine_map<(m, n, k) -> (k, n)>,
	//   affine_map<(m, n, k) -> (m, k)>,
	//   affine_map<(m, n, k) -> (n, m)>
	static SmallVector<AffineMap, 3> makeColumnMajorMatmulMaps(ModelBuilder &mb) {
	  MLIRContext *ctx = mb.getContext();
	  AffineExpr m, n, k;
	  bindDims(ctx, m, n, k);
	  // Every map has 3 dimensions and 0 symbols.
	  return {AffineMap::get(3, 0, {k, n}, ctx),
	          AffineMap::get(3, 0, {m, k}, ctx),
	          AffineMap::get(3, 0, {n, m}, ctx)};
	}

	// Emits into `mb` a function named `fn` that performs a matrix-matrix
	// contraction on whole 2-D vectors using the vector dialect. The contraction
	// is wrapped in a loop of ITERS trips so that a single call amortizes the
	// calling overhead.
	//
	// The function takes three arguments, each a memref (built with an empty
	// shape list) whose element type is a 2-D f32 vector: MxK, KxN and MxN.
	template <unsigned M, unsigned N, unsigned K, unsigned ITERS>
	void buildMatMat(ModelBuilder &mb, StringLiteral fn) {
	  auto elemTy = mb.f32;
	  auto lhsTy = mb.getMemRefType({}, mb.getVectorType({M, K}, elemTy));
	  auto rhsTy = mb.getMemRefType({}, mb.getVectorType({K, N}, elemTy));
	  auto accTy = mb.getMemRefType({}, mb.getVectorType({M, N}, elemTy));

	  auto func = mb.makeFunction(
	      fn, {}, {lhsTy, rhsTy, accTy},
	      MLIRFuncOpConfig().setEmitCInterface(true).setPreferAvx512(true));
	  OpBuilder builder(&func.getBody());
	  ScopedContext scope(builder, func.getLoc());

	  // Indexing maps, as produced by makeColumnMajorMatmulMaps:
	  //   affine_map<(m, n, k) -> (k, n)>,
	  //   affine_map<(m, n, k) -> (m, k)>,
	  //   affine_map<(m, n, k) -> (n, m)>
	  auto indexingMaps = mb.getAffineMapArrayAttr(makeColumnMajorMatmulMaps(mb));

	  // iterator_types = ["parallel", "parallel", "reduction"]
	  SmallVector<Attribute, 4> iteratorKinds;
	  for (const char *kind : {"parallel", "parallel", "reduction"})
	    iteratorKinds.push_back(mb.getStringAttr(kind));
	  auto iteratorTypes = mb.getArrayAttr(iteratorKinds);

	  // Repeat the kernel ITERS times inside the generated function.
	  StdIndexedValue lhs(func.getArgument(0));
	  StdIndexedValue rhs(func.getArgument(1));
	  StdIndexedValue acc(func.getArgument(2));
	  loopNestBuilder(std_constant_index(0), std_constant_index(ITERS),
	                  std_constant_index(1), [&](Value) {
	                    // acc += lhs x rhs, in column-major form; the result of
	                    // the contraction is stored back into the accumulator.
	                    acc() = (vector_contract(lhs(), rhs(), acc(),
	                                             indexingMaps, iteratorTypes));
	                  });
	  std_ret();
	}

	// Benchmark body for the column-major vector matmul.
	//
	// Template parameters:
	//   M, N, K                      - problem sizes of the contraction.
	//   MeasureBuild                 - when true, building + JIT-compiling + one
	//                                  invocation are all inside the timed loop;
	//                                  when false, only invocations are timed.
	//   LowerToLLVMMatrixIntrinsics  - selects VectorContractLowering::Matmul
	//                                  (true) or VectorContractLowering::Dot.
	template <unsigned M, unsigned N, unsigned K, bool MeasureBuild,
	          bool LowerToLLVMMatrixIntrinsics>
	void BM_MxMColMajorVectors(benchmark::State &state) {
	  constexpr unsigned NumMxMPerIteration = 1000;
	  // NOTE(review): this counter is reported for both modes, although the
	  // build-time mode below only performs one multiplication per call.
	  state.counters["NumMxM/Iter"] = NumMxMPerIteration;
	  // Column major vector types.
	  using TypeLHS = Vector2D<K, M, float>;
	  using TypeRHS = Vector2D<N, K, float>;
	  using TypeRES = Vector2D<N, M, float>;
	  // Prepare arguments beforehand. Each initializer writes exactly as many
	  // floats as its own operand type declares (K*M, N*K and N*M respectively)
	  // so that non-square instantiations neither overrun nor under-fill a buffer.
	  // Assumes the vector types store their floats densely packed — TODO confirm
	  // for non-power-of-two dimensions.
	  auto oneInit = [](unsigned idx, TypeLHS *ptr) {
	    float *p = reinterpret_cast<float *>(ptr + idx);
	    for (unsigned i = 0; i < K * M; ++i) p[i] = 1.0f;
	  };
	  auto incInit = [](unsigned idx, TypeRHS *ptr) {
	    float *p = reinterpret_cast<float *>(ptr + idx);
	    for (unsigned i = 0; i < N * K; ++i) p[i] = 1.0f + i;
	  };
	  auto zeroInit = [](unsigned idx, TypeRES *ptr) {
	    float *p = reinterpret_cast<float *>(ptr + idx);
	    for (unsigned i = 0; i < N * M; ++i) p[i] = 0.0f;
	  };
	  auto A = makeInitializedStridedMemRefDescriptor<TypeLHS, 1>({1}, oneInit);
	  auto B = makeInitializedStridedMemRefDescriptor<TypeRHS, 1>({1}, incInit);
	  auto C = makeInitializedStridedMemRefDescriptor<TypeRES, 1>({1}, zeroInit);
	  StringLiteral funcName = "matmult_column_major";

	  vector::VectorTransformsOptions vectorTransformsOptions{
	      LowerToLLVMMatrixIntrinsics ? vector::VectorContractLowering::Matmul
	                                  : vector::VectorContractLowering::Dot};
	  CompilationOptions compilationOptions{/*llvmOptLevel=*/3, /*llcOptLevel=*/3,
	                                        vectorTransformsOptions};
	  if (MeasureBuild) {
	    // If this is a build-time benchmark, build, compile, and execute
	    // the function inside the timed loop, building a fresh new function
	    // in each iteration to get the full JIT time (keep ITERS == 1 here).
	    for (auto _ : state) {
	      ModelBuilder builder;
	      buildMatMat<M, N, K, 1>(builder, funcName);
	      ModelRunner runner(builder.getModuleRef());
	      runner.compile(compilationOptions);
	      auto err = runner.invoke(funcName, A, B, C);
	      if (err) llvm_unreachable("Error compiling/running function.");
	    }
	  } else {
	    // If this is a run-time benchmark, build, compile, and execute
	    // the function once outside the timed loop, then continue running
	    // the same function inside the loop to focus on actual runtime
	    // (set ITERS == NumMxMPerIteration here to amortize calling overhead).
	    ModelBuilder builder;
	    buildMatMat<M, N, K, NumMxMPerIteration>(builder, funcName);
	    ModelRunner runner(builder.getModuleRef());
	    runner.compile(compilationOptions);
	    auto err = runner.invoke(funcName, A, B, C);
	    if (err) llvm_unreachable("Error compiling/running function.");
	    for (auto _ : state) {
	      auto err_run = runner.invoke(funcName, A, B, C);
	      if (err_run) llvm_unreachable("Error running function.");
	    }
	  }
	}

	// Program entry point: hands the command line to the benchmark library,
	// exits with status 1 if it reports unrecognized arguments, and otherwise
	// runs the benchmarks selected on the command line.
	int main(int argc, char **argv) {
	  ::benchmark::Initialize(&argc, argv);
	  const bool hasUnrecognizedArgs =
	      ::benchmark::ReportUnrecognizedArguments(argc, argv);
	  if (hasUnrecognizedArgs) {
	    return 1;
	  }
	  ::benchmark::RunSpecifiedBenchmarks();
	  return 0;
	}

	//
	// Benchmark drivers (build-time and run-time).
	//

	// Registers the four variants of BM_MxMColMajorVectors for one problem size.
	// The last two template arguments are <MeasureBuild,
	// LowerToLLVMMatrixIntrinsics>, so the variants are, in registration order:
	//   build-time / Dot lowering, build-time / Matmul lowering,
	//   run-time   / Dot lowering, run-time   / Matmul lowering.
	#define BENCHMARK_MATMUL_COLUMN_MAJOR(SZ_M, SZ_N, SZ_K) \
	BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, true, false); \
	BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, true, true); \
	BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, false, false); \
	BENCHMARK_TEMPLATE(BM_MxMColMajorVectors, SZ_M, SZ_N, SZ_K, false, true);

	// Square problem sizes only (M == N == K), from 1 up to 16.
	BENCHMARK_MATMUL_COLUMN_MAJOR(1, 1, 1);
	BENCHMARK_MATMUL_COLUMN_MAJOR(2, 2, 2);
	BENCHMARK_MATMUL_COLUMN_MAJOR(4, 4, 4);
	BENCHMARK_MATMUL_COLUMN_MAJOR(8, 8, 8);
	BENCHMARK_MATMUL_COLUMN_MAJOR(16, 16, 16);