blob: 027cad14e56c77a244cf2cd5df1032836acf760b [file]
// Copyright 2022 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Design rationale and code creep warning!
//
// Summary:
//
// The goal of this test is to provide 100% coverage across all
// internal kernel variants, which is not convenient to do in e2e tests.
// Resist the temptation to reimplement here all the niceties of the e2e test.
// Stick to guaranteeing that if the test succeeds, then the mmt4d builtin,
// with all its asm code path variants, is correct. In case of failure, the
// user is expected to be happy to jump into a debugger.
//
// Longer story:
//
// It is said by an ancient prophecy that all matrix multiplication tests grow
// to be thousands of lines of code.
//
// In fact, we already have one: the end-to-end matmul test under
// iree/tests/e2e/matmul. That one is needed anyway, and needs to be large
// anyway, being end-to-end and applying to all target backends, including those
// where device!=host. And so it makes sense for that one to have extra bells
// and whistles such as fuzzy comparisons, pretty-printing of numerical errors
// to aid debugging, and yet more special logic to make numerical errors easier
// to debug.
//
// Let's not duplicate all that here! Note also that, tempting as it would
// be to borrow the matrix-pretty-printing stuff from e2e/matmul, that applies
// to plain row-major 2D matrices, while here we are dealing with 4D arrays /
// tiled-layout matrices. Trying to bridge over that difference would bring yet
// more complexity.
//
// Instead, let us keep a sharp focus on why we need this separate micro test.
// The motivation is not the usual "because micro tests are easier to debug than
// e2e" but rather because it would be difficult to have 100% code coverage in
// e2e. There are many variants of mmt4d builtin ukernels for various CPU
// features and tuned for various CPU models. We have to iterate over all these
// variants. Trying to do so in e2e tests would require exposing knobs for
// things that we would otherwise prefer to keep internal in the mmt4d builtin
// implementation, and would make e2e/matmul tests even more expensive.
#include <vector>
#include "iree/base/api.h"
#include "iree/base/internal/cpu.h"
#include "iree/builtins/ukernel/api.h"
#include "iree/builtins/ukernel/tools/ukernel_test_utils.h"
#include "iree/testing/gtest.h"
#include "iree/testing/status_matchers.h"
template <typename lhs_t, typename rhs_t, typename out_t>
static void iree_mmt4d_reference(const iree_uk_mmt4d_params_t& params) {
bool accumulate = params.flags & IREE_UK_FLAG_ACCUMULATE;
iree_uk_ssize_t lhs_tile_size = params.M0 * params.K0;
iree_uk_ssize_t rhs_tile_size = params.N0 * params.K0;
iree_uk_ssize_t out_tile_size = params.M0 * params.N0;
for (iree_uk_ssize_t i = 0; i < params.M; ++i) {
for (iree_uk_ssize_t j = 0; j < params.N; ++j) {
out_t* out_tile_ptr = ((out_t*)params.out_buffer) +
i * params.out_stride + j * out_tile_size;
const lhs_t* lhs_panel_ptr =
((const lhs_t*)params.lhs_buffer) + i * params.lhs_stride;
const rhs_t* rhs_panel_ptr =
((const rhs_t*)params.rhs_buffer) + j * params.rhs_stride;
for (iree_uk_ssize_t i0 = 0; i0 < params.M0; ++i0) {
for (iree_uk_ssize_t j0 = 0; j0 < params.N0; ++j0) {
const lhs_t* lhs_tile_ptr = lhs_panel_ptr;
const rhs_t* rhs_tile_ptr = rhs_panel_ptr;
out_t* out_ptr = out_tile_ptr + i0 * params.N0 + j0;
out_t acc = accumulate ? *out_ptr : 0.f;
for (iree_uk_ssize_t k = 0; k < params.K; ++k) {
for (iree_uk_ssize_t k0 = 0; k0 < params.K0; ++k0) {
out_t lhs_val = lhs_tile_ptr[i0 * params.K0 + k0];
out_t rhs_val = rhs_tile_ptr[j0 * params.K0 + k0];
acc += lhs_val * rhs_val;
}
lhs_tile_ptr += lhs_tile_size;
rhs_tile_ptr += rhs_tile_size;
}
*out_ptr = acc;
}
}
}
}
}
// Type-erased entry point for the reference implementation: picks the
// template instantiation whose element types match params.type. Any other
// type value trips the assertion.
static void iree_mmt4d_reference(const iree_uk_mmt4d_params_t& params) {
  if (params.type == iree_uk_mmt4d_type_f32f32f32) {
    iree_mmt4d_reference<float, float, float>(params);
    return;
  }
  if (params.type == iree_uk_mmt4d_type_i8i8i32) {
    iree_mmt4d_reference<iree_uk_int8_t, iree_uk_int8_t, iree_uk_int32_t>(
        params);
    return;
  }
  assert(false && "unknown type");
}
static void test_one_matmul_using_given_lhs_rhs(
const iree_uk_mmt4d_params_t& shared_params,
iree_uk_test_random_engine_t* engine) {
assert(!shared_params.out_buffer);
iree_uk_mmt4d_params_t reference_params;
memcpy(&reference_params, &shared_params, sizeof shared_params);
iree_uk_type_t out_type = iree_uk_mmt4d_out_type(shared_params.type);
iree_uk_ssize_t out_buffer_size = iree_uk_test_2d_buffer_length(
out_type, shared_params.M, shared_params.out_stride);
reference_params.out_buffer = malloc(out_buffer_size);
iree_uk_test_write_random_buffer(reference_params.out_buffer, out_buffer_size,
out_type, engine);
iree_uk_mmt4d_params_t actual_params;
memcpy(&actual_params, &shared_params, sizeof shared_params);
actual_params.out_buffer = malloc(out_buffer_size);
memcpy(actual_params.out_buffer, reference_params.out_buffer,
out_buffer_size);
iree_mmt4d_reference(reference_params);
iree_uk_mmt4d(&actual_params);
// For now we use exact comparisons, even for float, even though the reference
// code accumulates in a different order compared to the actual code. This
// relies on picking input test matrix elements so that all intermediate
// values are exactly representable - i.e. small integer numerators. This
// become problematic when we do float16. See the comment at the top of this
// file explaining how we refrain from letting this grow into a 1000-line-long
// fully-featured test.
if (memcmp(actual_params.out_buffer, reference_params.out_buffer,
out_buffer_size)) {
const auto& p = actual_params;
fprintf(stderr, "mmt4d test failure with the following params:\n");
char types_str[32];
iree_uk_test_type_triple_str(types_str, sizeof types_str, p.type);
fprintf(stderr, " types: %s\n", types_str);
fprintf(stderr, " flags: accumulate=%d\n",
(bool)(p.flags & IREE_UK_FLAG_ACCUMULATE));
fprintf(stderr, " M=%d, N=%d, K=%d\n", (int)p.M, (int)p.N, (int)p.K);
fprintf(stderr, " M0=%d, N0=%d, K0=%d\n", (int)p.M0, (int)p.N0, (int)p.K0);
fprintf(stderr, " lhs_stride=%zu, rhs_stride=%zu, out_stride=%zu\n",
(size_t)p.lhs_stride, (size_t)p.rhs_stride, (size_t)p.out_stride);
char cpu_feat_str[32];
iree_uk_test_cpu_features_str(cpu_feat_str, sizeof cpu_feat_str, p.cpu_data,
1);
fprintf(stderr, " cpu features: %s\n", cpu_feat_str);
// Don't even try to pretty-print matrices. See the comment at the top of
// this file. Don't try to use GTest primitives to show expected vs actual
// since that would require dispatching to type-specific code paths.
// Also, at this point it's easy for the user to rerun this test
// in a debugger and manually inspect values.
//
// We want fatal here - that is what the user running this in a debugger
// wants us to do, so they can inspect values while they exist in memory.
// What's the GTest-sanctioned fatal error? GTEST_FAIL() has a comment that
// says that it's fatal, but that's a lie at least here on Android.
iree_abort();
}
free(reference_params.out_buffer);
free(actual_params.out_buffer);
}
static void test_one_matmul_creating_lhs_rhs_for_given_shape(
const iree_uk_mmt4d_params_t& shared_params,
iree_uk_test_random_engine_t* engine) {
iree_uk_mmt4d_params_t params;
memcpy(&params, &shared_params, sizeof params);
assert(!params.lhs_buffer);
assert(!params.rhs_buffer);
assert(!params.out_buffer);
assert(!params.lhs_stride);
assert(!params.rhs_stride);
assert(!params.out_stride);
// Populate strides first - we need them below to compute buffer lengths.
// Randomly make strides either tight or not to exercise all cases.
params.lhs_stride = params.K * params.M0 * params.K0 +
iree_uk_test_random_engine_get_0_1(engine);
params.rhs_stride = params.K * params.N0 * params.K0 +
iree_uk_test_random_engine_get_0_1(engine);
params.out_stride = params.N * params.M0 * params.N0 +
iree_uk_test_random_engine_get_0_1(engine);
iree_uk_type_t lhs_type = iree_uk_mmt4d_lhs_type(params.type);
iree_uk_type_t rhs_type = iree_uk_mmt4d_rhs_type(params.type);
iree_uk_ssize_t lhs_buffer_size =
iree_uk_test_2d_buffer_length(lhs_type, params.M, params.lhs_stride);
iree_uk_ssize_t rhs_buffer_size =
iree_uk_test_2d_buffer_length(rhs_type, params.N, params.rhs_stride);
void* lhs_buffer = malloc(lhs_buffer_size);
void* rhs_buffer = malloc(rhs_buffer_size);
iree_uk_test_write_random_buffer(lhs_buffer, lhs_buffer_size, lhs_type,
engine);
iree_uk_test_write_random_buffer(rhs_buffer, rhs_buffer_size, rhs_type,
engine);
params.lhs_buffer = lhs_buffer;
params.rhs_buffer = rhs_buffer;
test_one_matmul_using_given_lhs_rhs(params, engine);
free(lhs_buffer);
free(rhs_buffer);
}
// Runs the matmul test over a fixed list of M/N/K shapes, each one first
// without and then with IREE_UK_FLAG_ACCUMULATE.
//
// `shared_params` supplies everything else (type, tile sizes, cpu_data); its
// M, N, K and flags must all be zero on entry.
static void test_matmuls_for_various_MNK_shapes_and_flags(
    const iree_uk_mmt4d_params_t& shared_params,
    iree_uk_test_random_engine_t* engine) {
  iree_uk_mmt4d_params_t params;
  memcpy(&params, &shared_params, sizeof params);
  assert(params.M == 0);
  assert(params.N == 0);
  assert(params.K == 0);
  assert(params.flags == 0);

  struct shape_mnk_t {
    int m, n, k;
  };
  const shape_mnk_t shapes[] = {
      // M==0: degenerate, nothing to compute.
      {0, 1, 1},
      {0, 5, 7},
      // N==0: degenerate, nothing to compute.
      {1, 0, 1},
      {5, 0, 7},
      // K==0: degenerate. With ACCUMULATE nothing happens; without it the
      // output buffer is zeroed.
      {1, 1, 0},
      {5, 7, 0},
      // Non-degenerate shapes.
      {1, 1, 1},
      {1, 1, 2},
      {1, 1, 10},
      {1, 1, 1000},
      {2, 1, 1},
      {1, 2, 1},
      {2, 2, 2},
      {5, 7, 13},
  };

  for (const shape_mnk_t& shape : shapes) {
    params.M = shape.m;
    params.N = shape.n;
    params.K = shape.k;
    // Non-accumulating run first, then the accumulating one.
    params.flags = 0;
    test_one_matmul_creating_lhs_rhs_for_given_shape(params, engine);
    params.flags = IREE_UK_FLAG_ACCUMULATE;
    test_one_matmul_creating_lhs_rhs_for_given_shape(params, engine);
  }
}
// Tests mmt4d for one data type and one M0xN0xK0 tile format.
//
// The shape/flag sweep is always run once with all-zero CPU data. If
// cpu_data_field_0_bit is nonzero it must be a single bit (a power of two);
// when the host CPU reports that feature, the sweep is run a second time with
// the bit set in field 0 of the CPU data, otherwise that second run is
// skipped with a message.
static void mmt4d_test(iree_uk_mmt4d_type_t type, int M0, int N0, int K0,
                       iree_uk_uint64_t cpu_data_field_0_bit) {
  // One engine per testcase keeps testcases independent of each other: the
  // outcome is the same whether a case is run alone or in a different order.
  // Replaying the same pseudorandom sequence in every case is acceptable,
  // since any sequence covers equally well and the tile shapes usually differ
  // between cases anyway.
  iree_uk_test_random_engine_t* engine = iree_uk_test_random_engine_create();

  iree_uk_mmt4d_params_t params;
  memset(&params, 0, sizeof params);
  params.type = type;
  params.M0 = M0;
  params.N0 = N0;
  params.K0 = K0;

  // Baseline pass with no optional CPU feature requested. This is worth
  // running even on CPUs that have the feature, because it is what covers the
  // fallback to architecture-default or generic code.
  const iree_uk_uint64_t baseline_cpu_data[IREE_CPU_DATA_FIELD_COUNT] = {0};
  params.cpu_data = baseline_cpu_data;
  test_matmuls_for_various_MNK_shapes_and_flags(params, engine);

  // Optional second pass with the requested CPU feature bit.
  if (cpu_data_field_0_bit != 0) {
    const iree_uk_uint64_t feature_cpu_data[IREE_CPU_DATA_FIELD_COUNT] = {
        cpu_data_field_0_bit};
    params.cpu_data = feature_cpu_data;
    // Running a feature-specific path on a CPU lacking the feature would
    // crash, so check the host first.
    const bool host_has_feature =
        (iree_cpu_data_field(0) & params.cpu_data[0]) != 0;
    char feature_name[32];
    iree_uk_test_cpu_features_str(feature_name, sizeof feature_name,
                                  params.cpu_data, 1);
    if (!host_has_feature) {
      printf("Skipped: device does not support CPU feature: %s\n",
             feature_name);
    } else {
      printf("Device supports CPU feature: %s\n", feature_name);
      test_matmuls_for_various_MNK_shapes_and_flags(params, engine);
    }
  }

  iree_uk_test_random_engine_destroy(engine);
}
// Defines one GTest case in the Mmt4dTest suite that calls mmt4d_test().
//   type:        token pasted onto `iree_uk_mmt4d_type_` to name the data type
//                (e.g. f32f32f32); it is also part of the test name.
//   M0, N0, K0:  tile sizes, used both as arguments and in the test name.
//   test_suffix: token appended to the test name to tell variants apart.
//   feature_bit: passed through as mmt4d_test()'s cpu_data_field_0_bit; 0
//                means only the baseline pass is run.
// The generated test name is <type>_tile_<M0>x<N0>x<K0>_<test_suffix>.
#define MMT4D_TEST(type, M0, N0, K0, test_suffix, feature_bit) \
TEST(Mmt4dTest, type##_tile_##M0##x##N0##x##K0##_##test_suffix) { \
mmt4d_test(iree_uk_mmt4d_type_##type, M0, N0, K0, feature_bit); \
}
// Generic tests, not matching any particular CPU feature. This is the place to
// test weird M0, N0, K0 to ensure e.g. that we haven't unwittingly baked in a
// power-of-two assumption. The feature bit is 0, so mmt4d_test() only runs its
// baseline pass for these.
MMT4D_TEST(f32f32f32, 3, 5, 7, generic, 0)
MMT4D_TEST(i8i8i32, 9, 6, 3, generic, 0)
// ARM_64 tests. Only compiled when IREE_UK_ARCH_ARM_64 is defined.
#if defined(IREE_UK_ARCH_ARM_64)
// Test with the `arm_64` name suffix and feature bit 0, i.e. only the baseline
// pass of mmt4d_test() is run.
#define MMT4D_ARM_64_TEST(type, M0, N0, K0) \
MMT4D_TEST(type, M0, N0, K0, arm_64, 0)
// Test with the `arm_64_<FEATURE>` name suffix that passes
// IREE_CPU_DATA_FIELD_0_AARCH64_HAVE_<FEATURE> as the feature bit, so
// mmt4d_test() runs a second pass with that feature when the host CPU
// reports it.
#define MMT4D_ARM_64_TEST_WITH_CPU_FEATURE(type, M0, N0, K0, FEATURE) \
MMT4D_TEST(type, M0, N0, K0, arm_64_##FEATURE, \
IREE_CPU_DATA_FIELD_0_AARCH64_HAVE_##FEATURE)
MMT4D_ARM_64_TEST(f32f32f32, 8, 8, 1)
MMT4D_ARM_64_TEST(i8i8i32, 8, 8, 1)
MMT4D_ARM_64_TEST_WITH_CPU_FEATURE(i8i8i32, 8, 8, 4, DOTPROD)
MMT4D_ARM_64_TEST_WITH_CPU_FEATURE(i8i8i32, 8, 8, 8, I8MM)
#endif // defined(IREE_UK_ARCH_ARM_64)
// Test binary entry point: sets up GoogleTest and IREE's CPU information, then
// runs every registered test and returns GoogleTest's result as the exit code.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  // Done before any test runs; mmt4d_test() reads iree_cpu_data_field(0),
  // which presumably relies on this initialization.
  iree_cpu_initialize(iree_allocator_system());
  const int exit_code = RUN_ALL_TESTS();
  return exit_code;
}