| // Copyright 2022 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/builtins/ukernel/pack.h" |
| |
| #include <algorithm> |
| #include <cstring> |
| #include <vector> |
| |
| #include "iree/base/api.h" |
| #include "iree/base/internal/cpu.h" |
| #include "iree/builtins/ukernel/tools/ukernel_test_utils.h" |
| #include "iree/testing/gtest.h" |
| #include "iree/testing/status_matchers.h" |
| |
| static void iree_pack_reference(const iree_uk_pack_params_t& params) { |
| // For now, the input and output element types are always the same. |
| iree_uk_type_t elem_type = iree_uk_pack_in_type(params.type); |
| iree_uk_ssize_t elem_size = iree_uk_type_size(elem_type); |
| iree_uk_ssize_t outer_size0 = params.out_size0; |
| iree_uk_ssize_t outer_size1 = params.out_size1; |
| iree_uk_ssize_t tile_size0 = params.out_size2; |
| iree_uk_ssize_t tile_size1 = params.out_size3; |
| iree_uk_ssize_t out_stride_l0 = params.out_stride0; |
| iree_uk_ssize_t out_stride_l1 = params.out_size3 * params.out_size2; |
| iree_uk_ssize_t out_stride_l2 = params.out_size3; |
| iree_uk_ssize_t out_stride_l3 = 1; |
| if (params.flags & IREE_UK_FLAG_PACK_TRANSPOSE_OUTER) { |
| std::swap(outer_size0, outer_size1); |
| std::swap(out_stride_l0, out_stride_l1); |
| } |
| if (params.flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER) { |
| std::swap(tile_size0, tile_size1); |
| std::swap(out_stride_l2, out_stride_l3); |
| } |
| assert(outer_size0 * tile_size0 >= params.in_size0); |
| assert(outer_size1 * tile_size1 >= params.in_size1); |
| assert((outer_size0 - 1) * tile_size0 < params.in_size0); |
| assert((outer_size1 - 1) * tile_size1 < params.in_size1); |
| for (iree_uk_ssize_t outer_i0 = 0; outer_i0 < outer_size0; ++outer_i0) { |
| for (iree_uk_ssize_t outer_i1 = 0; outer_i1 < outer_size1; ++outer_i1) { |
| for (iree_uk_ssize_t tile_i0 = 0; tile_i0 < tile_size0; ++tile_i0) { |
| for (iree_uk_ssize_t tile_i1 = 0; tile_i1 < tile_size1; ++tile_i1) { |
| iree_uk_ssize_t out_offset = |
| outer_i0 * out_stride_l0 + tile_i0 * out_stride_l2 + |
| outer_i1 * out_stride_l1 + tile_i1 * out_stride_l3; |
| iree_uk_ssize_t i0 = outer_i0 * tile_size0 + tile_i0; |
| iree_uk_ssize_t i1 = outer_i1 * tile_size1 + tile_i1; |
| char* out_ptr = ((char*)params.out_buffer) + out_offset * elem_size; |
| if (i0 >= params.in_size0 || i1 >= params.in_size1) { |
| memcpy(out_ptr, params.padding_value, elem_size); |
| } else { |
| iree_uk_ssize_t in_offset = i1 + i0 * params.in_stride0; |
| const char* in_ptr = |
| ((char*)params.in_buffer) + in_offset * elem_size; |
| memcpy(out_ptr, in_ptr, elem_size); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| static void test_one_pack_using_given_input( |
| const iree_uk_pack_params_t& shared_params, |
| iree_uk_test_random_engine_t* engine) { |
| assert(!shared_params.out_buffer); |
| |
| iree_uk_pack_params_t reference_params; |
| memcpy(&reference_params, &shared_params, sizeof shared_params); |
| iree_uk_type_t out_type = iree_uk_pack_out_type(shared_params.type); |
| iree_uk_ssize_t out_buffer_size = iree_uk_test_2d_buffer_length( |
| out_type, shared_params.out_size0, shared_params.out_stride0); |
| reference_params.out_buffer = malloc(out_buffer_size); |
| iree_uk_test_write_random_buffer(reference_params.out_buffer, out_buffer_size, |
| out_type, engine); |
| |
| iree_uk_pack_params_t actual_params; |
| memcpy(&actual_params, &shared_params, sizeof shared_params); |
| actual_params.out_buffer = malloc(out_buffer_size); |
| iree_uk_test_write_random_buffer(actual_params.out_buffer, out_buffer_size, |
| out_type, engine); |
| |
| iree_pack_reference(reference_params); |
| iree_uk_status_t status = iree_uk_pack(&actual_params); |
| if (status != iree_uk_status_ok) { |
| fprintf(stderr, "FATAL: iree_uk_pack failed: %s\n", |
| iree_uk_status_message(status)); |
| iree_abort(); |
| } |
| |
| // For now we use exact comparisons, even for float, even though the reference |
| // code accumulates in a different order compared to the actual code. This |
| // relies on picking input test matrix elements so that all intermediate |
| // values are exactly representable - i.e. small integer numerators. This |
| // become problematic when we do float16. See the comment at the top of this |
| // file explaining how we refrain from letting this grow into a 1000-line-long |
| // fully-featured test. |
| if (memcmp(actual_params.out_buffer, reference_params.out_buffer, |
| out_buffer_size)) { |
| const auto& p = actual_params; |
| fprintf(stderr, "pack test failure with the following params:\n"); |
| char types_str[32]; |
| iree_uk_test_type_pair_str(types_str, sizeof types_str, p.type); |
| fprintf(stderr, " types: %s\n", types_str); |
| fprintf(stderr, " flags: transpose_inner=%d, transpose_outer=%d\n", |
| (bool)(p.flags & IREE_UK_FLAG_PACK_TRANSPOSE_INNER), |
| (bool)(p.flags & IREE_UK_FLAG_PACK_TRANSPOSE_OUTER)); |
| fprintf(stderr, " input shape: %dx%d\n", (int)p.in_size0, (int)p.in_size1); |
| fprintf(stderr, " output shape: %dx%dx%dx%d\n", (int)p.out_size0, |
| (int)p.out_size1, (int)p.out_size2, (int)p.out_size3); |
| fprintf(stderr, " input stride: %d\n", (int)p.in_stride0); |
| fprintf(stderr, " output stride: %d\n", (int)p.out_stride0); |
| // Don't even try to pretty-print matrices. See the comment at the top of |
| // this file. Don't try to use GTest primitives to show expected vs actual |
| // since that would require dispatching to type-specific code paths. |
| // Also, at this point it's easy for the user to rerun this test |
| // in a debugger and manually inspect values. |
| // |
| // We want fatal here - that is what the user running this in a debugger |
| // wants us to do, so they can inspect values while they exist in memory. |
| // What's the GTest-sanctioned fatal error? GTEST_FAIL() has a comment that |
| // says that it's fatal, but that's a lie at least here on Android. |
| iree_abort(); |
| } |
| |
| free(reference_params.out_buffer); |
| free(actual_params.out_buffer); |
| } |
| |
| static void test_one_pack_creating_input_for_given_shape( |
| const iree_uk_pack_params_t& shared_params, |
| iree_uk_test_random_engine_t* engine) { |
| iree_uk_pack_params_t params; |
| memcpy(¶ms, &shared_params, sizeof params); |
| assert(!params.in_buffer); |
| assert(!params.out_buffer); |
| assert(!params.in_stride0); |
| assert(!params.out_stride0); |
| // Populate strides first - we need them below to compute buffer lengths. |
| // Randomly make strides either tight or not to exercise all cases. |
| params.in_stride0 = |
| params.in_size1 + iree_uk_test_random_engine_get_0_1(engine); |
| params.out_stride0 = params.out_size1 * params.out_size2 * params.out_size3; |
| iree_uk_type_t in_type = iree_uk_pack_in_type(params.type); |
| iree_uk_ssize_t in_buffer_size = iree_uk_test_2d_buffer_length( |
| in_type, params.in_size0, params.in_stride0); |
| void* in_buffer = malloc(in_buffer_size); |
| iree_uk_test_write_random_buffer(in_buffer, in_buffer_size, in_type, engine); |
| params.in_buffer = in_buffer; |
| test_one_pack_using_given_input(params, engine); |
| free(in_buffer); |
| } |
| |
| static void pack_test_for_various_tile_shapes_and_flags( |
| iree_uk_pack_type_t type, int tile_size0, int tile_size1, |
| const iree_uk_uint64_t* cpu_data, iree_uk_test_random_engine_t* engine) { |
| struct outer_shape_t { |
| int size0, size1; |
| }; |
| std::vector<outer_shape_t> outer_shapes{ |
| // Degenerate cases. Vacuous. |
| {0, 1}, |
| {1, 0}, |
| // Non-degenerate cases. |
| {1, 1}, |
| {2, 2}, |
| {3, 2}, |
| {8, 8}, |
| {11, 13}, |
| {123, 45}, |
| }; |
| for (const auto& outer_shape : outer_shapes) { |
| for (bool transpose_inner : {false, true}) { |
| for (bool transpose_outer : {false, true}) { |
| iree_uk_pack_params_t params = {}; |
| params.type = type; |
| params.cpu_data = cpu_data; |
| iree_uk_ssize_t out_size0 = outer_shape.size0; |
| iree_uk_ssize_t out_size1 = outer_shape.size1; |
| iree_uk_ssize_t out_size2 = tile_size0; |
| iree_uk_ssize_t out_size3 = tile_size1; |
| params.out_size0 = out_size0; |
| params.out_size1 = out_size1; |
| params.out_size2 = out_size2; |
| params.out_size3 = out_size3; |
| params.flags = 0; |
| if (transpose_outer) { |
| params.flags |= IREE_UK_FLAG_PACK_TRANSPOSE_OUTER; |
| std::swap(out_size0, out_size1); |
| } |
| if (transpose_inner) { |
| params.flags |= IREE_UK_FLAG_PACK_TRANSPOSE_INNER; |
| std::swap(out_size2, out_size3); |
| } |
| iree_uk_ssize_t pad_size0 = |
| iree_uk_test_random_engine_get_0_65535(engine) % out_size2; |
| iree_uk_ssize_t pad_size1 = |
| iree_uk_test_random_engine_get_0_65535(engine) % out_size3; |
| params.in_size0 = |
| std::max<iree_uk_ssize_t>(0, out_size0 * out_size2 - pad_size0); |
| params.in_size1 = |
| std::max<iree_uk_ssize_t>(0, out_size1 * out_size3 - pad_size1); |
| iree_uk_type_t out_type = iree_uk_pack_out_type(type); |
| int out_elem_size = iree_uk_type_size(out_type); |
| void* padding_value_buffer = malloc(out_elem_size); |
| iree_uk_test_write_random_buffer(padding_value_buffer, out_elem_size, |
| out_type, engine); |
| params.padding_value = padding_value_buffer; |
| test_one_pack_creating_input_for_given_shape(params, engine); |
| free(padding_value_buffer); |
| } |
| } |
| } |
| } |
| |
| static void pack_test(iree_uk_pack_type_t type, int tile_size0, int tile_size1, |
| iree_uk_uint64_t cpu_data_field_0_bit) { |
| const iree_uk_uint64_t local_cpu_data_default[IREE_CPU_DATA_FIELD_COUNT] = { |
| 0}; |
| iree_uk_test_random_engine_t* engine = iree_uk_test_random_engine_create(); |
| // First try without any optional CPU feature. This matters even when the |
| // feature is supported by the CPU because we want to test the fallback to |
| // architecture-default or generic code. |
| pack_test_for_various_tile_shapes_and_flags(type, tile_size0, tile_size1, |
| local_cpu_data_default, engine); |
| // If this is nonzero, we are asked to test again with this CPU feature. |
| if (cpu_data_field_0_bit) { |
| const iree_uk_uint64_t local_cpu_data_with_bit[IREE_CPU_DATA_FIELD_COUNT] = |
| {cpu_data_field_0_bit}; |
| // Check if the CPU supports the feature (otherwise, we crash). |
| bool supported = iree_cpu_data_field(0) & cpu_data_field_0_bit; |
| char cpu_feat_str[32]; |
| iree_uk_test_cpu_features_str(cpu_feat_str, sizeof cpu_feat_str, |
| local_cpu_data_with_bit, 1); |
| if (supported) { |
| // Run with the optional CPU feature. |
| printf("Device supports CPU feature: %s\n", cpu_feat_str); |
| pack_test_for_various_tile_shapes_and_flags( |
| type, tile_size0, tile_size1, local_cpu_data_with_bit, engine); |
| } else { |
| printf("Skipped: device does not support CPU feature: %s\n", |
| cpu_feat_str); |
| } |
| } |
| |
| iree_uk_test_random_engine_destroy(engine); |
| } |
| |
| #define PACK_TEST(type, tile_size0, tile_size1, test_suffix, feature_bit) \ |
| TEST(PackTest, type##_tile_##tile_size0##x##tile_size1##_##test_suffix) { \ |
| pack_test(iree_uk_pack_type_##type, tile_size0, tile_size1, feature_bit); \ |
| } |
| |
| // Generic tests, not matching any particular CPU feature. This is the place to |
| // test weird tile shapes to ensure e.g. that we haven't unwittingly baked in a |
| // power-of-two assumption |
| PACK_TEST(f32f32, 3, 5, generic, 0) |
| PACK_TEST(i8i8, 4, 2, generic, 0) |
| PACK_TEST(i32i32, 3, 4, generic, 0) |
| |
// ARM_64 tests. Only compiled when targeting that architecture.
#if defined(IREE_UK_ARCH_ARM_64)

// ARM_64 test with no optional CPU feature requested; the test name gets an
// "arm_64" suffix.
#define PACK_ARM_64_TEST(types, tile0, tile1) \
  PACK_TEST(types, tile0, tile1, arm_64, 0)

// ARM_64 test that also requests the CPU feature whose bit is
// IREE_CPU_DATA_FIELD_0_AARCH64_HAVE_<FEAT>; the test name gets an
// "arm_64_<FEAT>" suffix.
#define PACK_ARM_64_TEST_WITH_CPU_FEATURE(types, tile0, tile1, FEAT) \
  PACK_TEST(types, tile0, tile1, arm_64_##FEAT,                       \
            IREE_CPU_DATA_FIELD_0_AARCH64_HAVE_##FEAT)

PACK_ARM_64_TEST(f32f32, 8, 1)
PACK_ARM_64_TEST(i8i8, 8, 1)
PACK_ARM_64_TEST_WITH_CPU_FEATURE(i8i8, 8, 4, DOTPROD)
PACK_ARM_64_TEST_WITH_CPU_FEATURE(i8i8, 8, 8, I8MM)

#endif  // defined(IREE_UK_ARCH_ARM_64)
| |
| int main(int argc, char** argv) { |
| ::testing::InitGoogleTest(&argc, argv); |
| iree_cpu_initialize(iree_allocator_system()); |
| return RUN_ALL_TESTS(); |
| } |