// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "sw/device/lib/base/macros.h"
#include "sw/device/lib/base/memory.h"
#include "sw/device/lib/runtime/ibex.h"
#include "sw/device/lib/runtime/log.h"
#include "sw/device/lib/testing/test_framework/check.h"
#include "sw/device/lib/testing/test_framework/ottf_main.h"

enum {
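  // Length in bytes of the two test buffers.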
kBufLen = 1000,
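  // Number of times each test's `func` is run per measurement; cycle counts
  // are summed across runs.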
kNumRuns = 10,
};

typedef struct perf_test {
// A human-readable name for this particular test, e.g. "memcpy".
const char *label;
// Setup functions for each context buffer. These function pointers must not
// be NULL. The runtime of these functions will not be measured.
void (*setup_buf1)(uint8_t *, size_t);
void (*setup_buf2)(uint8_t *, size_t);
// A function that exercises the function under test, e.g. memcpy. This
// function pointer must not be NULL. The runtime of this function will be
// measured.
  void (*func)(uint8_t *buf1, uint8_t *buf2, size_t len);
  // The expected total number of CPU cycles that `func` takes, summed across
  // all `kNumRuns` runs.
size_t expected_num_cycles;
} perf_test_t;

// Run the given `perf_test_t` `num_runs` times and return the total number of
// cycles it took.
static inline uint64_t perf_test_run(const perf_test_t *test, uint8_t *buf1,
                                     uint8_t *buf2, size_t num_runs) {
CHECK(test->setup_buf1 != NULL);
CHECK(test->setup_buf2 != NULL);
CHECK(test->func != NULL);
uint64_t total_clock_cycles = 0;
for (size_t i = 0; i < num_runs; ++i) {
test->setup_buf1(buf1, kBufLen);
test->setup_buf2(buf2, kBufLen);
uint64_t start_cycles = ibex_mcycle_read();
test->func(buf1, buf2, kBufLen);
uint64_t end_cycles = ibex_mcycle_read();
// Even if the 64-bit cycle counter overflowed while running the test, the
// following subtraction would still be well-defined and correct, provided
// that the test ran in fewer than 2**64 cycles. Practically, we should
// never see it overflow. Even if it took 12 hours to run all the perf
// tests, the clock would have to run at 427 THz to overflow the counter
// (2**64 cycles / (12 * 60 * 60) seconds).
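    // As an 8-bit analogy: if start = 0xFF and end = 0x01, then
    // (uint8_t)(end - start) == 0x02, the true number of elapsed cycles.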
const uint64_t num_cycles = end_cycles - start_cycles;
CHECK(total_clock_cycles < UINT64_MAX - num_cycles);
total_clock_cycles += num_cycles;
}
return total_clock_cycles;
}

// Fill the buffer with arbitrary but deterministically selected bytes.
static inline void fill_buf_deterministic_values(uint8_t *buf, size_t len) {
uint32_t state = 42;
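  // A simple multiply-and-add recurrence. Statistical quality is irrelevant
  // here; it only needs to produce the same bytes on every call.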
for (size_t i = 0; i < len; ++i) {
state = state * 17 + i;
buf[i] = (uint8_t)state;
}
}

// Zero out the buffer.
static inline void fill_buf_zeroes(uint8_t *buf, size_t len) {
  memset(buf, 0, len);
}

// Zero out the buffer, but put a one at the end.
static inline void fill_buf_zeroes_then_one(uint8_t *buf, size_t len) {
  fill_buf_zeroes(buf, len);
  buf[len - 1] = 1;
}

// Zero out the buffer, but put a one at the beginning.
static inline void fill_buf_one_then_zeroes(uint8_t *buf, size_t len) {
  fill_buf_zeroes(buf, len);
  buf[0] = 1;
}

OT_NOINLINE void test_memcpy(uint8_t *buf1, uint8_t *buf2, size_t len) {
  memcpy(buf1, buf2, len);
}

OT_NOINLINE void test_memset(uint8_t *buf1, uint8_t *buf2, size_t len) {
  const int value = buf2[0];
  memset(buf1, value, len);
}

OT_NOINLINE void test_memcmp(uint8_t *buf1, uint8_t *buf2, size_t len) {
  memcmp(buf1, buf2, len);
}

OT_NOINLINE void test_memrcmp(uint8_t *buf1, uint8_t *buf2, size_t len) {
  memrcmp(buf1, buf2, len);
}

OT_NOINLINE void test_memchr(uint8_t *buf1, uint8_t *buf2, size_t len) {
  const uint8_t value = buf1[len - 1];
  memchr(buf1, value, len);
}

OT_NOINLINE void test_memrchr(uint8_t *buf1, uint8_t *buf2, size_t len) {
  const uint8_t value = buf1[0];
  memrchr(buf1, value, len);
}

OTTF_DEFINE_TEST_CONFIG();

// Each value of `expected_num_cycles` was determined experimentally by
// testing on a CW310 FPGA with the following command:
//
// $ ./bazelisk.sh test --copt -O2 --test_output=all \
// //sw/device/lib/base:memory_perftest_fpga_cw310
//
// There are a handful of reasons why the expected number of cycles for this
// test might be inaccurate. Here are a few of them:
//
// (1) You've changed the definition of the test, e.g. changing the size of
// the test buffers.
// (2) You've changed the implementation of memset, memcpy, etc. and they can
// do the same job in fewer cycles.
// (3) The test was not compiled with `-O2`. The hardcoded cycle count
// expectations assume `-O2`.
// (4) The compiler has gotten smarter.
// (5) The icache gets turned on prior to test execution.
//
// If you observe that the cycle count is smaller than the hardcoded
// expectation, that's probably a good thing; please update the expectation!
static const perf_test_t kPerfTests[] = {
{
.label = "memcpy",
.setup_buf1 = &fill_buf_deterministic_values,
.setup_buf2 = &fill_buf_deterministic_values,
.func = &test_memcpy,
.expected_num_cycles = 33270,
},
{
.label = "memcpy_zeroes",
.setup_buf1 = &fill_buf_deterministic_values,
.setup_buf2 = &fill_buf_zeroes,
.func = &test_memcpy,
.expected_num_cycles = 33270,
},
{
.label = "memset",
.setup_buf1 = &fill_buf_zeroes,
.setup_buf2 = &fill_buf_deterministic_values,
.func = &test_memset,
.expected_num_cycles = 23200,
},
{
.label = "memset_zeroes",
.setup_buf1 = &fill_buf_zeroes,
.setup_buf2 = &fill_buf_zeroes,
.func = &test_memset,
.expected_num_cycles = 23200,
},
{
.label = "memcmp_pathological",
.setup_buf1 = &fill_buf_zeroes_then_one,
.setup_buf2 = &fill_buf_zeroes,
.func = &test_memcmp,
.expected_num_cycles = 110740,
},
{
.label = "memcmp_zeroes",
.setup_buf1 = &fill_buf_zeroes,
.setup_buf2 = &fill_buf_zeroes,
.func = &test_memcmp,
.expected_num_cycles = 110740,
},
{
.label = "memrcmp_pathological",
.setup_buf1 = &fill_buf_zeroes,
.setup_buf2 = &fill_buf_one_then_zeroes,
.func = &test_memrcmp,
.expected_num_cycles = 50740,
},
{
.label = "memrcmp_zeroes",
.setup_buf1 = &fill_buf_zeroes,
.setup_buf2 = &fill_buf_zeroes,
.func = &test_memrcmp,
.expected_num_cycles = 50850,
},
{
.label = "memchr_pathological",
.setup_buf1 = &fill_buf_deterministic_values,
.setup_buf2 = &fill_buf_zeroes,
.func = &test_memchr,
.expected_num_cycles = 7250,
},
{
.label = "memrchr_pathological",
.setup_buf1 = &fill_buf_deterministic_values,
.setup_buf2 = &fill_buf_deterministic_values,
.func = &test_memrchr,
.expected_num_cycles = 23850,
},
};
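// To measure an additional scenario, add an entry above following the same
// pattern. A sketch (this entry is hypothetical; its `expected_num_cycles` is
// a placeholder that would need to be measured on the CW310 before use):
//
//   {
//       .label = "memcmp_deterministic",
//       .setup_buf1 = &fill_buf_deterministic_values,
//       .setup_buf2 = &fill_buf_deterministic_values,
//       .func = &test_memcmp,
//       .expected_num_cycles = 0,  // Placeholder: measure on FPGA first.
//   },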
static uint8_t buf1[kBufLen];
static uint8_t buf2[kBufLen];

bool test_main(void) {
bool all_expectations_match = true;
for (size_t i = 0; i < ARRAYSIZE(kPerfTests); ++i) {
const perf_test_t *test = &kPerfTests[i];
const uint64_t num_cycles = perf_test_run(test, buf1, buf2, kNumRuns);
if (num_cycles != test->expected_num_cycles) {
all_expectations_match = false;
// Cast cycle counts to `uint32_t` before printing because `base_printf()`
// cannot print `uint64_t`.
CHECK(test->expected_num_cycles < UINT32_MAX);
CHECK(num_cycles < UINT32_MAX);
const uint32_t expected_num_cycles_u32 =
(uint32_t)test->expected_num_cycles;
const uint32_t num_cycles_u32 = (uint32_t)num_cycles;
CHECK(num_cycles < UINT32_MAX / 100);
      const uint32_t percent_of_expected =
          (100 * num_cycles_u32) / expected_num_cycles_u32;
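      // E.g., expected = 23200 and actual = 23432 gives
      // (100 * 23432) / 23200 == 101, printed as "101%".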
      LOG_WARNING(
          "%s:\n"
          "  Expected:        %10d cycles\n"
          "  Actual:          %10d cycles\n"
          "  Actual/Expected: %10d%%\n",
          test->label, expected_num_cycles_u32, num_cycles_u32,
          percent_of_expected);
}
}
return all_expectations_match;
}