blob: 120618551db042058651ce40cc79a2b3e596357d [file] [log] [blame]
/*
* Copyright 2017, Data61
* Commonwealth Scientific and Industrial Research Organisation (CSIRO)
* ABN 41 687 119 230.
*
* This software may be distributed and modified according to the terms of
* the BSD 2-Clause license. Note that NO WARRANTY is provided.
* See "LICENSE_BSD2.txt" for details.
*
* @TAG(DATA61_BSD)
*/
#pragma once
#include <autoconf.h>
#include <stdint.h>
#include <utils/util.h>
//function attributes
//ultra-short, time-sensitive functions
#define FASTFN inline __attribute__((always_inline))
//short, cache-sensitive functions (note: short means one cache line)
#define CACHESENSFN __attribute__((noinline, aligned(64)))
//functions that will be passed to seL4_DebugRun() -- fast, but obviously not inlined
#define KERNELFN __attribute__((noinline, flatten))
#define IN_TX_BIT BIT(0)
#define IN_TXCP_BIT BIT(1)
#include "events.h"
//CPUID leaf node numbers
enum {
IA32_CPUID_LEAF_BASIC = 0,
IA32_CPUID_LEAF_MODEL = 1,
IA32_CPUID_LEAF_PMC = 0xA,
IA32_CPUID_LEAF_EXTENDED = 0x80000000,
};
//CPUID.0 "GenuineIntel"
#define IA32_CPUID_BASIC_MAGIC_EBX 0x756E6547
#define IA32_CPUID_BASIC_MAGIC_ECX 0x6C65746E
#define IA32_CPUID_BASIC_MAGIC_EDX 0x49656E69
//CPUID.1 Family and Model ID macros and type.
#define FAMILY(x) ( (x).family == 0xF ? ( (x).ex_family + (x).family) : (x).family )
#define MODEL(x) ( ((x).family == 0xF || (x).family == 0x6) ? (((x).ex_model << 4) + (x).model ) : (x).model )
#define IA32_CPUID_FAMILY_P6 0x6
typedef union {
struct {
seL4_Word stepping : 4;
seL4_Word model : 4;
seL4_Word family : 4;
seL4_Word type : 2;
seL4_Word reserved1 : 2;
seL4_Word ex_model : 4;
seL4_Word ex_family : 8;
seL4_Word reserved2 : 4;
};
uint32_t raw;
} ia32_cpuid_model_info_t;
//CPUID.PMC Performance-monitoring macros and types
typedef union {
struct {
uint8_t pmc_version_id;
uint8_t gp_pmc_count_per_core;
uint8_t gp_pmc_bit_width;
uint8_t ebx_bit_vector_length;
};
uint32_t raw;
} ia32_cpuid_leaf_pmc_eax_t;
//Control-register constants
#define IA32_CR4_PCE 8
//PMC MSRs
#define IA32_MSR_PMC_PERFEVTSEL_BASE 0x186
#define IA32_MSR_PMC_PERFEVTCNT_BASE 0x0C1
typedef union {
struct {
uint16_t event;
union {
struct {
uint8_t USR : 1;
uint8_t OS : 1;
uint8_t E : 1;
uint8_t PC : 1;
uint8_t INT : 1;
uint8_t res : 1;
uint8_t EN : 1;
uint8_t INV : 1;
};
uint8_t flags;
};
uint8_t cmask;
};
uint32_t raw;
} ia32_pmc_perfevtsel_t;
//Convenient execution of CPUID instruction. The first version isn't volatile, so is for querying the processor; the second version just serialises.
//This looks slow, but gcc inlining is smart enough to optimise away all the memory references, and takes unused information into account.
static FASTFN void sel4bench_private_cpuid(uint32_t leaf, uint32_t subleaf, uint32_t * eax, uint32_t * ebx, uint32_t * ecx, uint32_t * edx)
{
asm (
"cpuid"
: "=a"(*eax) /* output eax */
, "=b"(*ebx) /* output ebx */
, "=c"(*ecx) /* output ecx */
, "=d"(*edx) /* output edx */
: "a" (leaf) /* input query leaf */
, "c" (subleaf) /* input query subleaf */
);
}
static FASTFN void sel4bench_private_cpuid_serial()
{
//set leaf and subleaf to 0 for predictability
uint32_t eax = 0;
uint32_t ecx = 0;
asm volatile (
"cpuid"
: "+a"(eax) /* eax = 0 and gets clobbered */
, "+c"(ecx) /* ecx = 0 and gets clobbered */
: /* no other inputs to this version */
: "%ebx" /* clobber ebx */
, "%edx" /* clobber edx */
, "cc" /* clobber condition code */
);
}
static FASTFN void sel4bench_private_lfence()
{
asm volatile("lfence");
}
static FASTFN uint64_t sel4bench_private_rdtsc()
{
uint32_t lo, hi;
asm volatile (
"rdtsc"
: "=a"(lo), "=d"(hi)
);
return (((uint64_t)hi << 32ull) | (uint64_t)lo);
}
static FASTFN uint64_t sel4bench_private_rdpmc(uint32_t counter)
{
uint32_t hi, lo;
asm volatile (
"rdpmc"
: "=a"(lo), "=d"(hi)
: "c"(counter)
);
return (((uint64_t)hi << 32ull) | (uint64_t)lo);
}
//Serialization instruction for before and after reading PMCs
//See comment in arch/sel4bench.h for details.
#ifdef SEL4BENCH_STRICT_PMC_SERIALIZATION
#define sel4bench_private_serialize_pmc sel4bench_private_cpuid_serial
#else //SEL4BENCH_STRICT_PMC_SERIALIZATION
#define sel4bench_private_serialize_pmc sel4bench_private_lfence
#endif //SEL4BENCH_STRICT_PMC_SERIALIZATION
/* Hide these definitions if using kernel exported PMC to prevent warnings */
#ifndef CONFIG_EXPORT_PMC_USER
//enable user-level pmc access
static KERNELFN void sel4bench_private_enable_user_pmc(void* arg)
{
#ifdef CONFIG_ARCH_X86_64
uint64_t dummy;
asm volatile (
"movq %%cr4, %0;"
"orq %[pce], %0;"
"movq %0, %%cr4;"
: "=r" (dummy)
: [pce] "i" BIT(IA32_CR4_PCE)
: "cc"
);
#else
uint32_t dummy;
asm volatile (
"movl %%cr4, %0;" /* read CR4 */
"orl %[pce], %0;" /* enable PCE flag */
"movl %0, %%cr4;" /* write CR4 */
: "=r" (dummy) /* fake output to ask GCC for a register */
: [pce] "i" BIT(IA32_CR4_PCE) /* input PCE flag */
: "cc" /* clobber condition code */
);
#endif
}
//disable user-level pmc access
static KERNELFN void sel4bench_private_disable_user_pmc(void* arg)
{
#ifdef CONFIG_ARCH_X86_64
uint64_t dummy;
asm volatile (
"movq %%cr4, %0;"
"andq %[pce], %0;"
"movq %0, %%cr4;"
: "=r" (dummy)
: [pce] "i" (~BIT(IA32_CR4_PCE))
: "cc"
);
#else
uint32_t dummy;
asm volatile (
"movl %%cr4, %0;" /* read CR4 */
"andl %[pce], %0;" /* enable PCE flag */
"movl %0, %%cr4;" /* write CR4 */
: "=r" (dummy) /* fake output to ask GCC for a register */
: [pce] "i" (~BIT(IA32_CR4_PCE)) /* input PCE flag */
: "cc" /* clobber condition code */
);
#endif
}
#endif
#ifndef CONFIG_KERNEL_X86_DANGEROUS_MSR
//read an MSR
static KERNELFN void sel4bench_private_rdmsr(void* arg)
{
uint32_t* msr = (uint32_t*)arg;
asm volatile (
"rdmsr"
: "=a" (msr[1]) /* output low */
, "=d" (msr[2]) /* output high */
: "c" (msr[0]) /* input MSR index */
);
}
//write an MSR
static KERNELFN void sel4bench_private_wrmsr(void* arg)
{
uint32_t* msr = (uint32_t*)arg;
asm volatile (
"wrmsr"
: /* no output */
: "a" (msr[1]) /* input low */
, "d" (msr[2]) /* input high */
, "c" (msr[0]) /* input MSR index */
);
}
#endif
//generic event tables for lookup fn below
//they use direct event numbers, rather than the constants in events.h, because it's smaller
static seL4_Word SEL4BENCH_IA32_WESTMERE_EVENTS[5] = {
0x0280, //CACHE_L1I_MISS
0x0151, //CACHE_L1D_MISS, must use counter 0 or 1
0x20C8, //TLB_L1I_MISS
0x80CB, //TLB_L1D_MISS
0x5FCB //SEL4BENCH_IA32_WESTMERE_EVENT_CACHE_|{L1D_HIT,L2_HIT,L3P_HIT,L3_HIT,L3_MISS,LFB_HIT}_R
};
static seL4_Word SEL4BENCH_IA32_NEHALEM_EVENTS[5] = {
0x0280, //CACHE_L1I_MISS
0x0151, //CACHE_L1D_MISS, must use counter 0 or 1
0x20C8, //TLB_L1I_MISS
0x80CB, //TLB_L1D_MISS
0x5FCB //SEL4BENCH_IA32_NEHALEM_EVENT_CACHE_|{L1D_HIT,L2_HIT,L3P_HIT,L3_HIT,L3_MISS,LFB_HIT}_R
};
static seL4_Word SEL4BENCH_IA32_CORE2_EVENTS[5] = {
0x0081, //CACHE_L1I_MISS
0x01CB, //CACHE_L1D_MISS, must use counter 0
0x00C9, //TLB_L1I_MISS
0x10CB, //TLB_L1D_MISS, must use counter 0
0x03C0 //SEL4BENCH_IA32_CORE2_EVENT_RETIRE_MEMORY_|{READ,WRITE}
};
static seL4_Word SEL4BENCH_IA32_CORE_EVENTS[5] = {
0x0081, //CACHE_L1I_MISS
0x0000, //CACHE_L1D_MISS, not available on CORE
0x0085, //TLB_L1I_MISS
0x0049, //TLB_L1D_MISS
0x0143 //MEMORY_ACCESS
};
static seL4_Word SEL4BENCH_IA32_P6_EVENTS[5] = {
0x0081, //CACHE_L1I_MISS
0x0045, //CACHE_L1D_MISS
0x0085, //TLB_L1I_MISS
0x0000, //TLB_L1D_MISS, not available on P6
0x0043 //MEMORY_ACCESS
};
static seL4_Word SEL4BENCH_IA32_HASWELL_EVENTS[5] = {
0x0280, //CACHE_L1I_MISS
0x0151, //CACHE_L1D_MISS, must use counter 0 or 1
0x0085, //TLB_L1I_MISS
0x0049, //TLB_L1D_MISS
0x412E //LLC_MISS
};
static seL4_Word SEL4BENCH_IA32_BROADWELL_EVENTS[5] = {
0x0280, //ICACHE.MISSES
0x0151, //L1D.REPLACEMENT
0x2185, //ITLB_MISSES.MISS_CAUSES_A_WALK | ITLB_MISSES.STLB_HIT_4K
0x0000, //No combined load/store dTLB miss counter available
0x412E //LONGEST_LAT_CACHE.MISS
};
static seL4_Word SEL4BENCH_IA32_SKYLAKE_EVENTS[5] = {
0x0000, //No combined tag/data iCache miss counter available
0x0151, //L1D.REPLACEMENT
0x2185, //ITLB_MISSES.MISS_CAUSES_A_WALK | ITLB_MISSES.STLB_HIT
0x0000, //No combined load/store dTLB miss counter available
0x412E //LONGEST_LAT_CACHE.MISS
};
static FASTFN seL4_Word sel4bench_private_lookup_event(event_id_t event)
{
if ((SEL4BENCH_EVENT_GENERIC_MASK & event) == SEL4BENCH_EVENT_GENERIC_MASK) {
uint32_t dummy = 0;
ia32_cpuid_model_info_t model_info = { .raw = 0 };
sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &model_info.raw, &dummy, &dummy, &dummy);
//we should be a P6
assert(FAMILY(model_info) == IA32_CPUID_FAMILY_P6);
uint8_t model = MODEL(model_info);
event = event & ~SEL4BENCH_EVENT_GENERIC_MASK;
//Using the model summary on http://www.sandpile.org/x86/cpuid.htm#level_0000_0001h
//Let's hope it's accurate...
//We are also pretending Atoms don't exist
//P3 or PM
if (model <= 0x0D || model == 0x15) {
return SEL4BENCH_IA32_P6_EVENTS[event];
}
switch (model) {
//CORE
case 0x0E:
return SEL4BENCH_IA32_CORE_EVENTS[event];
//CORE2
case 0x0F:
case 0x16:
case 0x17:
case 0x1D:
return SEL4BENCH_IA32_CORE2_EVENTS[event];
//NEHALEM
case 0x1A:
case 0x1E:
case 0x1F:
case 0x2E:
return SEL4BENCH_IA32_NEHALEM_EVENTS[event];
//WESTMERE
case 0x25:
case 0x2C:
case 0x2F:
return SEL4BENCH_IA32_WESTMERE_EVENTS[event];
//SANDY BRIDGE
case 0x2A:
case 0x2D:
return 0x0000; //TODO
//IVY BRIDGE
case 0x3A:
case 0x3E:
return 0x0000; //TODO
//HASWELL
case 0x3C:
case 0x3F:
case 0x45:
case 0x46:
return SEL4BENCH_IA32_HASWELL_EVENTS[event];
//BROADWELL
case 0x3D:
case 0x47:
case 0x4F:
case 0x56:
return SEL4BENCH_IA32_BROADWELL_EVENTS[event];
//SKYLAKE
case 0x4E:
case 0x5E:
return SEL4BENCH_IA32_SKYLAKE_EVENTS[event];
//Unknown
default:
return 0x0000;
}
} else {
return event;
}
}