| /* |
| * Copyright 2014, NICTA |
| * |
| * This software may be distributed and modified according to the terms of |
| * the BSD 2-Clause license. Note that NO WARRANTY is provided. |
| * See "LICENSE_BSD2.txt" for details. |
| * |
| * @TAG(NICTA_BSD) |
| */ |
| |
| #ifndef __ARCH_SEL4BENCH_H__ |
| #define __ARCH_SEL4BENCH_H__ |
| |
| #include <assert.h> |
| #include <stdio.h> |
| #include <stdint.h> |
| #include <stdbool.h> |
| |
/* Read the full 64-bit cycle counter (TSC). CPUID is executed before and
 * after RDTSC as a serialising barrier, so that instructions from the
 * surrounding code cannot be reordered into or out of the read.
 */
#define SEL4BENCH_READ_CCNT(var) do { \
| uint32_t low, high; \ |
| asm volatile( \ |
| "movl $0, %%eax \n" \ |
| "movl $0, %%ecx \n" \ |
| "cpuid \n" \ |
| "rdtsc \n" \ |
| "movl %%edx, %0 \n" \ |
| "movl %%eax, %1 \n" \ |
| "movl $0, %%eax \n" \ |
| "movl $0, %%ecx \n" \ |
| "cpuid \n" \ |
| : \ |
| "=r"(high), \ |
| "=r"(low) \ |
| : \ |
| : "eax", "ebx", "ecx", "edx" \ |
| ); \ |
| (var) = (((uint64_t)high) << 32ull) | ((uint64_t)low); \ |
| } while(0) |
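
/* Usage sketch (illustrative only): timing a region of code with the TSC.
 * `work()` is a placeholder for whatever is being measured; ccnt_t and
 * CCNT_FORMAT are defined further down in this header.
 *
 *     ccnt_t start, end;
 *     SEL4BENCH_READ_CCNT(start);
 *     work();
 *     SEL4BENCH_READ_CCNT(end);
 *     printf("elapsed: " CCNT_FORMAT " cycles\n", end - start);
 */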
| |
| //standard libsel4bench events |
| #define SEL4BENCH_EVENT_CACHE_L1I_MISS SEL4BENCH_IA32_EVENT_CACHE_L1I_MISS |
| #define SEL4BENCH_EVENT_CACHE_L1D_MISS SEL4BENCH_IA32_EVENT_CACHE_L1D_MISS |
| #define SEL4BENCH_EVENT_TLB_L1I_MISS SEL4BENCH_IA32_EVENT_TLB_L1I_MISS |
| #define SEL4BENCH_EVENT_TLB_L1D_MISS SEL4BENCH_IA32_EVENT_TLB_L1D_MISS |
| #define SEL4BENCH_EVENT_MEMORY_ACCESS SEL4BENCH_IA32_EVENT_MEMORY_ACCESS |
| #define SEL4BENCH_EVENT_BRANCH_MISPREDICT SEL4BENCH_IA32_EVENT_BRANCH_MISPREDICT |
| |
/* Intel docs are somewhat unclear as to exactly how to serialize PMCs.
 * We use LFENCE for the moment, because it's much faster than CPUID. If
 * event counts turn out to be unreliable, switch to CPUID by uncommenting
 * the define below.
 *
 * Note that enabling strict serialization currently breaks the GCC register
 * allocator.
 */
| //#define SEL4BENCH_STRICT_PMC_SERIALIZATION |
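
/* For reference, the two serialization options above amount to roughly the
 * following (a sketch only; the real fences live in sel4bench_private.h):
 *
 *     asm volatile("lfence" ::: "memory");                               // default, fast
 *     asm volatile("cpuid" ::: "eax", "ebx", "ecx", "edx", "memory");    // strict
 */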
| |
| #include "sel4bench_private.h" |
| |
| #define CCNT_FORMAT "%llu" |
| typedef uint64_t ccnt_t; |
| |
| /* The framework as it stands supports the following Intel processors: |
| * - All P6-family processors (up to and including the Pentium M) |
| * - All processors supporting IA-32 architectural performance |
| * monitoring (that is, processors starting from the Intel Core Solo, |
| * codenamed Yonah) |
 * - Note: this only works when compiled with -m32; building with -m64 will
 *   break the inline assembly.
| */ |
| #if __x86_64__ |
| #error this code only works for a 32-bit architecture |
| #endif |
| |
/* Declare seL4_DebugRun() here to silence warnings when it is not enabled in
 * the kernel and we never call it. If we actually call it without
 * seL4_DebugRun enabled, we'll get a link failure, so this should be OK.
 */
| void seL4_DebugRun(void (* userfn) (void *), void* userarg); |
| |
| static FASTFN void sel4bench_init() { |
| seL4_Word cpuid_eax; |
| seL4_Word cpuid_ebx; |
| seL4_Word cpuid_ecx; |
| seL4_Word cpuid_edx; |
| sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &cpuid_eax, &cpuid_ebx, &cpuid_ecx, &cpuid_edx); |
| |
| //check we're running on an Intel chip |
| assert(cpuid_ebx == IA32_CPUID_BASIC_MAGIC_EBX && cpuid_ecx == IA32_CPUID_BASIC_MAGIC_ECX && cpuid_edx == IA32_CPUID_BASIC_MAGIC_EDX); |
| |
| //check that either we support architectural performance monitoring, or we're running on a P6-class chip |
| if(cpuid_eax < IA32_CPUID_LEAF_PMC) { //basic CPUID invocation tells us whether the processor supports arch PMCs |
| //if not, ensure we're on a P6-family processor |
| ia32_cpuid_model_info_t cpuid_model_info; |
| sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &(cpuid_model_info.raw), &cpuid_ebx, &cpuid_ecx, &cpuid_edx); |
        assert(FAMILY(cpuid_model_info) == IA32_CPUID_FAMILY_P6);
        if(FAMILY(cpuid_model_info) != IA32_CPUID_FAMILY_P6)
            //bail out gracefully on unsupported chips when asserts are compiled out
            return;
| } |
| |
| //enable user-mode RDPMC |
| seL4_DebugRun(&sel4bench_private_enable_user_pmc, NULL); |
| } |
| |
| static FASTFN sel4bench_counter_t sel4bench_get_cycle_count() { |
| sel4bench_private_serialize_pmc(); /* Serialise all preceding instructions */ |
| uint64_t time = sel4bench_private_rdtsc(); |
| sel4bench_private_serialize_pmc(); /* Serialise all following instructions */ |
| |
| return time; |
| } |
| |
| static FASTFN seL4_Word sel4bench_get_num_counters() { |
| seL4_Word dummy; |
| |
| //make sure the processor supports the PMC CPUID leaf |
| seL4_Word max_basic_leaf = 0; |
| sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &dummy, &dummy, &dummy); |
| if(max_basic_leaf >= IA32_CPUID_LEAF_PMC) { //Core Solo or later supports PMC discovery via CPUID... |
| //query the processor's PMC data |
| ia32_cpuid_leaf_pmc_eax_t pmc_eax; |
| |
| sel4bench_private_cpuid(IA32_CPUID_LEAF_PMC, 0, &pmc_eax.raw, &dummy, &dummy, &dummy); |
| return pmc_eax.gp_pmc_count_per_core; |
| } else { //P6 (including Pentium M) doesn't... |
| ia32_cpuid_model_info_t model_info; |
| |
| sel4bench_private_cpuid(IA32_CPUID_LEAF_MODEL, 0, &model_info.raw, &dummy, &dummy, &dummy); |
| assert(FAMILY(model_info) == IA32_CPUID_FAMILY_P6); //we only support P6 processors (P3, PM, ...) |
| |
| return 2; //2 PMCs on P6 |
| } |
| } |
| |
| static FASTFN sel4bench_counter_t sel4bench_get_counter(seL4_Word counter) { |
| sel4bench_private_serialize_pmc(); /* Serialise all preceding instructions */ |
| uint64_t counter_val = sel4bench_private_rdpmc(counter); |
| sel4bench_private_serialize_pmc(); /* Serialise all following instructions */ |
| |
| return counter_val; |
| } |
| |
| //ASSUMES x86-32 -- NEEDS A REWRITE FOR x86-64! |
| static CACHESENSFN sel4bench_counter_t sel4bench_get_counters(seL4_Word counters, sel4bench_counter_t* values) { |
| unsigned char counter = 0; |
| |
| sel4bench_private_serialize_pmc(); /* Serialise all preceding instructions */ |
| for(; counters != 0; counters >>= 1, counter++) |
| if(counters & 1) |
| values[counter] = sel4bench_private_rdpmc(counter); |
| |
| uint64_t time = sel4bench_private_rdtsc(); |
| sel4bench_private_serialize_pmc(); /* Serialise all following instructions */ |
| |
| return time; |
| } |
| |
| static FASTFN void sel4bench_set_count_event(seL4_Word counter, seL4_Word event) { |
| //one implementation, because P6 and architectural PMCs work identically |
| |
| assert(counter < sel4bench_get_num_counters()); |
| |
| //{RD,WR}MSR support data structure |
| seL4_Word msr_data[3]; |
| msr_data[0] = IA32_MSR_PMC_PERFEVTSEL_BASE + counter; |
| msr_data[1] = 0; |
| msr_data[2] = 0; |
| |
| //read current event-select MSR |
| seL4_DebugRun(&sel4bench_private_rdmsr, msr_data); |
| |
| //preserve the reserved flag, like the docs tell us |
| seL4_Word res_flag = ((ia32_pmc_perfevtsel_t)msr_data[1]).res; |
| |
| //rewrite the MSR to what we want |
| ia32_pmc_perfevtsel_t evtsel_msr; |
| evtsel_msr.raw = 0; |
| evtsel_msr.event = sel4bench_private_lookup_event(event); |
| evtsel_msr.USR = 1; |
| evtsel_msr.OS = 1; |
| evtsel_msr.res = res_flag; |
| msr_data[1] = evtsel_msr.raw; |
| |
| assert(evtsel_msr.event != 0); |
| |
| //write back to the event-select MSR |
| seL4_DebugRun(&sel4bench_private_wrmsr, msr_data); |
| } |
| |
| static FASTFN void sel4bench_set_count_intx_bits(seL4_Word counter, bool in_tx, bool in_txcp) { |
    /* The Haswell uArch extends perfevtsel with bit 32 (IN_TX), which counts
     * events only while inside an RTM transactional region, and bit 33
     * (IN_TXCP), which excludes events that occur in RTM transactional
     * regions that subsequently abort. */
| unsigned int in_tx_bit, in_txcp_bit; |
| |
| assert(counter < sel4bench_get_num_counters()); |
    assert( !(counter != 2 && in_txcp) ); //IN_TXCP is only supported on counter 2
| |
| //{RD,WR}MSR support data structure |
| seL4_Word msr_data[3]; |
| msr_data[0] = IA32_MSR_PMC_PERFEVTSEL_BASE + counter; |
| msr_data[1] = 0; |
| msr_data[2] = 0; |
| |
| //read current event-select MSR |
| seL4_DebugRun(&sel4bench_private_rdmsr, msr_data); |
| |
| in_tx_bit = (in_tx ? IN_TX_BIT : 0); |
| in_txcp_bit = (in_txcp ? IN_TXCP_BIT : 0); |
| msr_data[2] &= (~(IN_TX_BIT)) & (~(IN_TXCP_BIT)); |
| msr_data[2] |= in_tx_bit | in_txcp_bit; |
| |
| //write back to the event-select MSR |
| seL4_DebugRun(&sel4bench_private_wrmsr, msr_data); |
| } |
| |
| static FASTFN void sel4bench_start_counters(seL4_Word counters) { |
| /* On P6, only the first counter has an enable flag, which controls both counters |
| * simultaneously. |
| * Arch PMCs are all done independently. |
| */ |
| |
| seL4_Word num_counters = sel4bench_get_num_counters(); |
| if(counters == ~(0UL)) { |
| counters = ((1 << num_counters) - 1); |
| } else { |
| assert((~((1 << num_counters) - 1) & counters) == 0); |
| } |
| |
| seL4_Word max_basic_leaf = 0; |
| sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &num_counters, &num_counters, &num_counters); |
| //num_counters is now garbage |
| |
| if(!(max_basic_leaf >= IA32_CPUID_LEAF_PMC)) { |
| //we're P6, because otherwise the init() assertion would have tripped |
| assert(counters == 0x3); |
| if(counters == 0x3) |
| counters = 1; |
| else |
| return; |
| } |
| |
| //{RD,WR}MSR support data structure |
| seL4_Word msr_data[3]; |
| |
| seL4_Word counter; |
    //walk the counter bitmask; the body clears each bit of `counters` as that
    //counter is handled, so the loop terminates once the mask is empty
    for(counter = 0; counters; counter++) {
| if(!(counters & (1 << counter))) |
| continue; |
| |
| counters &= ~(1 << counter); |
| |
| //read appropriate MSR |
| msr_data[0] = IA32_MSR_PMC_PERFEVTSEL_BASE + counter; |
| msr_data[1] = 0; |
| msr_data[2] = 0; |
| seL4_DebugRun(&sel4bench_private_rdmsr, msr_data); |
| |
| //twiddle enable bit |
| ia32_pmc_perfevtsel_t temp = { .raw = msr_data[1] }; |
| temp.EN = 1; |
| msr_data[1] = temp.raw; |
| |
| //write back appropriate MSR |
| seL4_DebugRun(&sel4bench_private_wrmsr, msr_data); |
| |
| //zero the counter |
| msr_data[0] = IA32_MSR_PMC_PERFEVTCNT_BASE + counter; |
| msr_data[1] = 0; |
| msr_data[2] = 0; |
| seL4_DebugRun(&sel4bench_private_wrmsr, msr_data); |
| } |
| |
| } |
| |
| static FASTFN void sel4bench_stop_counters(seL4_Word counters) { |
| /* On P6, only the first counter has an enable flag, which controls both counters |
| * simultaneously. |
| * Arch PMCs are all done independently. |
| */ |
| |
| seL4_Word num_counters = sel4bench_get_num_counters(); |
| if(counters == ~(0UL)) { |
| counters = ((1 << num_counters) - 1); |
| } else { |
| assert((~((1 << num_counters) - 1) & counters) == 0); |
| } |
| |
| seL4_Word max_basic_leaf = 0; |
| sel4bench_private_cpuid(IA32_CPUID_LEAF_BASIC, 0, &max_basic_leaf, &num_counters, &num_counters, &num_counters); |
| //num_counters is now garbage |
| |
| if(!(max_basic_leaf >= IA32_CPUID_LEAF_PMC)) { |
| //we're P6, because otherwise the init() assertion would have tripped |
| assert(counters == 0x3); |
| counters = 1; |
| } |
| |
| //{RD,WR}MSR support data structure |
| seL4_Word msr_data[3]; |
| |
| seL4_Word counter; |
    //walk the counter bitmask; the body clears each bit of `counters` as that
    //counter is handled, so the loop terminates once the mask is empty
    for(counter = 0; counters; counter++) {
| if(!(counters & (1 << counter))) |
| continue; |
| |
| counters &= ~(1 << counter); |
| |
| //read appropriate MSR |
| msr_data[0] = IA32_MSR_PMC_PERFEVTSEL_BASE + counter; |
| msr_data[1] = 0; |
| msr_data[2] = 0; |
| seL4_DebugRun(&sel4bench_private_rdmsr, msr_data); |
| |
| //twiddle enable bit |
| ia32_pmc_perfevtsel_t temp = { .raw = msr_data[1] }; |
| temp.EN = 0; |
| msr_data[1] = temp.raw; |
| |
| //write back appropriate MSR |
| seL4_DebugRun(&sel4bench_private_wrmsr, msr_data); |
| } |
| } |
| |
| static FASTFN void sel4bench_destroy() { |
| //stop all performance counters |
| sel4bench_stop_counters(-1); |
| |
| //disable user-mode RDPMC |
| seL4_DebugRun(&sel4bench_private_disable_user_pmc, NULL); |
| } |
| |
static FASTFN void sel4bench_reset_counters(seL4_Word counters) {
    //{RD,WR}MSR support data structure
    seL4_Word msr_data[3];
    msr_data[0] = IA32_MSR_PMC_PERFEVTCNT_BASE;
    msr_data[1] = 0;
    msr_data[2] = 0;

    //zero the count MSR of each counter selected in the bitmask
    for(; counters != 0; counters >>= 1, msr_data[0]++)
        if(counters & 1)
            seL4_DebugRun(&sel4bench_private_wrmsr, msr_data);
}
| |
| #endif /* __ARCH_SEL4BENCH_H__ */ |