| // Copyright 2022 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include <stdio.h> |
| |
| #include "iree/base/api.h" |
| #include "iree/base/internal/flags.h" |
| #include "iree/builtins/ukernel/api.h" |
| #include "iree/builtins/ukernel/tools/benchmark.h" |
| #include "iree/builtins/ukernel/tools/memcpy_benchmark.h" |
| #include "iree/builtins/ukernel/tools/util.h" |
| #include "iree/builtins/ukernel/unpack_internal.h" |
| |
| IREE_FLAG( |
| int64_t, working_set_size, 10000, |
| "Number of bytes to be traversed by the benchmark workload (input and " |
| "output buffers together). Matrix shapes are computed accordingly."); |
| IREE_FLAG( |
| int32_t, padding_size, 0, |
| "Padding size (same value used for both dimensions, 0 means no padding)"); |
| |
| static iree_status_t iree_uk_benchmark_unpack( |
| const iree_benchmark_def_t* benchmark_def, |
| iree_benchmark_state_t* benchmark_state) { |
| const iree_uk_benchmark_user_data_t* user_data = benchmark_def->user_data; |
| const iree_uk_unpack_params_t* src_params = |
| iree_uk_benchmark_params(user_data); |
| iree_uk_unpack_params_t params; |
| memcpy(¶ms, src_params, sizeof params); |
| params.cpu_data = iree_uk_benchmark_cpu_data(user_data); |
| iree_uk_unpack_type_t unpack_type = iree_uk_unpack_type(params.flags); |
| iree_uk_type_t in_type = iree_uk_unpack_in_type(unpack_type); |
| iree_uk_type_t out_type = iree_uk_unpack_out_type(unpack_type); |
| iree_uk_index_t in_type_size = iree_uk_type_size(in_type); |
| iree_uk_index_t out_type_size = iree_uk_type_size(out_type); |
| |
| // The inner dims 2, 3 are given to us as part of the benchmark user_data. |
| // The outer dims 0, 1 are to be determined based on FLAG_working_set_size. |
| iree_uk_index_t in_size0 = 1; |
| iree_uk_index_t in_size1 = 1; |
| iree_uk_index_t in_size2 = params.in_size2; |
| iree_uk_index_t in_size3 = params.in_size3; |
| int target_matrix_size_in_elems = |
| FLAG_working_set_size / (in_type_size + out_type_size); |
| int target_product_of_outer_sizes_0_1 = |
| target_matrix_size_in_elems / (in_size2 * in_size3); |
| while (target_product_of_outer_sizes_0_1 >= 4) { |
| target_product_of_outer_sizes_0_1 /= 4; |
| in_size0 *= 2; |
| in_size1 *= 2; |
| } |
| in_size1 *= target_product_of_outer_sizes_0_1; |
| params.in_size0 = in_size0; |
| params.in_size1 = in_size1; |
| if (params.flags & IREE_UK_FLAG_UNPACK_TRANSPOSE_OUTER) { |
| iree_uk_index_swap(&in_size0, &in_size1); |
| } |
| if (params.flags & IREE_UK_FLAG_UNPACK_TRANSPOSE_INNER) { |
| iree_uk_index_swap(&in_size2, &in_size3); |
| } |
| params.out_size0 = iree_max(0, in_size0 * in_size2 - FLAG_padding_size); |
| params.out_size1 = iree_max(0, in_size1 * in_size3 - FLAG_padding_size); |
| params.out_stride0 = params.out_size1; |
| params.in_stride0 = params.in_size1 * params.in_size2 * params.in_size3; |
| iree_uk_index_t in_buffer_size = |
| iree_uk_2d_buffer_length(in_type, params.in_size0, params.in_stride0); |
| iree_uk_index_t out_buffer_size = |
| iree_uk_2d_buffer_length(out_type, params.out_size0, params.out_stride0); |
| void* in_buffer = malloc(in_buffer_size); |
| void* out_buffer = malloc(out_buffer_size); |
| iree_uk_random_engine_t* engine = iree_uk_benchmark_random_engine(user_data); |
| // It's just about plausible that on some platform, for some number type, |
| // performance might be different on zero buffers vs random buffers. But it |
| // shouldn't matter that we recreate the random engine every time, getting |
| // the same random values again. |
| iree_uk_write_random_buffer(in_buffer, in_buffer_size, in_type, engine); |
| iree_uk_write_random_buffer(out_buffer, out_buffer_size, out_type, engine); |
| params.in_buffer = in_buffer; |
| params.out_buffer = out_buffer; |
| int64_t total_iterations = 0; |
| int64_t batch_count = 1; |
| while (iree_benchmark_keep_running(benchmark_state, batch_count)) { |
| for (int i = 0; i < batch_count; ++i) { |
| iree_uk_unpack(¶ms); |
| } |
| total_iterations += batch_count; |
| batch_count *= 2; |
| } |
| // Report bytes per second, so that can be easily compared to known memory |
| // system performance metrics (e.g. RAM bandwidth, to tell whether this is |
| // memory-bound). |
| iree_benchmark_set_bytes_processed(benchmark_state, |
| total_iterations * out_buffer_size); |
| free(in_buffer); |
| free(out_buffer); |
| return iree_ok_status(); |
| } |
| |
| static void iree_uk_benchmark_register_unpack(iree_uk_uint32_t flags, |
| int tile_size0, int tile_size1, |
| const char* cpu_features) { |
| char type_str[32]; |
| iree_uk_unpack_type_t unpack_type = iree_uk_unpack_type(flags); |
| iree_uk_type_pair_str(type_str, sizeof type_str, unpack_type); |
| iree_uk_unpack_params_t params = {.in_size2 = tile_size0, |
| .in_size3 = tile_size1}; |
| typedef struct unpack_variant_t { |
| const char* label; |
| iree_uk_uint32_t flags; |
| } unpack_variant_t; |
| const unpack_variant_t variants[] = { |
| {"trnone", 0}, |
| {"trinner", IREE_UK_FLAG_UNPACK_TRANSPOSE_INNER}, |
| {"trouter", IREE_UK_FLAG_UNPACK_TRANSPOSE_OUTER}, |
| {"trboth", IREE_UK_FLAG_UNPACK_TRANSPOSE_INNER | |
| IREE_UK_FLAG_UNPACK_TRANSPOSE_OUTER}, |
| }; |
| for (int i = 0; i < IREE_ARRAYSIZE(variants); ++i) { |
| unpack_variant_t variant = variants[i]; |
| char name[128]; |
| snprintf(name, sizeof name, "unpack_%s_tile_%dx%d_%s_wss_%" PRIi64, |
| type_str, tile_size0, tile_size1, variant.label, |
| FLAG_working_set_size); |
| params.flags = flags | variant.flags; |
| iree_uk_benchmark_register(name, iree_uk_benchmark_unpack, ¶ms, |
| sizeof params, cpu_features); |
| } |
| } |
| |
| int main(int argc, char** argv) { |
| iree_flags_set_usage("unpack_benchmark", ""); |
| |
| iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv); |
| iree_uk_benchmark_initialize(&argc, argv); |
| |
| // The memcpy benchmark provides a useful comparison point, as pack is fairly |
| // close to memory-bound. |
| iree_uk_benchmark_register_memcpy(FLAG_working_set_size); |
| |
| #if defined(IREE_ARCH_ARM_64) |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_F32F32, 8, 8, ""); |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_I32I32, 8, 8, ""); |
| #elif defined(IREE_ARCH_X86_64) |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_F32F32, 8, 8, |
| "avx2_fma"); |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_I32I32, 8, 8, |
| "avx2_fma"); |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_F32F32, 16, 16, |
| "avx512_base"); |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_I32I32, 16, 16, |
| "avx512_base"); |
| #else // defined(IREE_ARCH_ARM_64) |
| // Architectures on which we do not have any optimized ukernel code. |
| // Benchmark some arbitrary tile shape. |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_F32F32, 8, 8, ""); |
| iree_uk_benchmark_register_unpack(IREE_UK_FLAG_UNPACK_TYPE_I32I32, 8, 8, ""); |
| #endif // defined(IREE_ARCH_ARM_64) |
| |
| iree_uk_benchmark_run_and_cleanup(); |
| } |