[PATCH] performance counters 3.1 for 2.5.45 [1/4]: x86 support

Mikael Pettersson (mikpe@csd.uu.se)
Fri, 1 Nov 2002 00:10:28 +0100 (MET)


Linus,

This patch set contains an updated version of the performance
monitoring counters driver I sent before.

The main changes are:
- the code is now safe for preemptible kernels
- a more direct system call interface
- dropped 2.4 compat files and #includes

Please consider this for inclusion in the 2.5 kernel.

The organisation of the files has changed, so the patch kits
are in a different order from before. The first three can be
applied in any order, but the last (kernel changes) requires
that the previous three have been applied.

/Mikael

This is part 1 of 4 of perfctr-3.1 for the 2.5.45 kernel:
the x86 low-level driver.

drivers/perfctr/x86.c | 1548 +++++++++++++++++++++++++++++++++++++++++++++
include/asm-i386/perfctr.h | 192 +++++
2 files changed, 1740 insertions(+)

diff -uN linux-2.5.45/drivers/perfctr/x86.c linux-2.5.45.perfctr-3.1/drivers/perfctr/x86.c
--- linux-2.5.45/drivers/perfctr/x86.c Thu Jan 1 01:00:00 1970
+++ linux-2.5.45.perfctr-3.1/drivers/perfctr/x86.c Thu Oct 31 23:16:59 2002
@@ -0,0 +1,1548 @@
+/* $Id: x86.c,v 1.74 2002/10/31 22:16:59 mikpe Exp $
+ * x86 performance-monitoring counters driver.
+ *
+ * Copyright (C) 1999-2002 Mikael Pettersson
+ */
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+
+#include <asm/perfctr.h>
+#include <asm/msr.h>
+#undef MSR_P6_PERFCTR0
+#include <asm/apic.h>
+#include <asm/timex.h>
+
+/* missing from <asm-i386/cpufeature.h> */
+#define cpu_has_msr boot_cpu_has(X86_FEATURE_MSR)
+
+struct perfctr_cpu_info perfctr_cpu_info;
+
+/* Support for lazy evntsel and perfctr MSR updates. */
+struct per_cpu_cache { /* roughly a subset of perfctr_cpu_state */
+ union {
+ unsigned int p5_cesr;
+ unsigned int id; /* cache owner id */
+ } k1;
+ struct {
+ /* NOTE: these caches have physical indices, not virtual */
+ unsigned int evntsel[18];
+ unsigned int escr[0x3E2-0x3A0];
+ unsigned int pebs_enable;
+ unsigned int pebs_matrix_vert;
+ } control;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+static DEFINE_PER_CPU(struct per_cpu_cache, per_cpu_cache);
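+
+/* On P6/K7/P4, the cache is keyed on k1.id: a write_control() that
+   finds the current CPU's cached id equal to the state's id skips all
+   MSR writes, and otherwise rewrites only those MSRs whose cached
+   values differ. On P5-like CPUs the cached CESR itself is compared. */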
+
+/* Intel P5, Cyrix 6x86MX/MII/III, Centaur WinChip C6/2/3 */
+#define MSR_P5_CESR 0x11
+#define MSR_P5_CTR0 0x12 /* .. 0x13 */
+#define P5_CESR_CPL 0x00C0
+#define P5_CESR_RESERVED (~0x01FF)
+#define MII_CESR_RESERVED (~0x05FF)
+#define C6_CESR_RESERVED (~0x00FF)
+
+/* Intel P6, VIA C3 */
+#define MSR_P6_PERFCTR0 0xC1 /* .. 0xC2 */
+#define MSR_P6_EVNTSEL0 0x186 /* .. 0x187 */
+#define P6_EVNTSEL_ENABLE 0x00400000
+#define P6_EVNTSEL_INT 0x00100000
+#define P6_EVNTSEL_CPL 0x00030000
+#define P6_EVNTSEL_RESERVED 0x00280000
+#define VC3_EVNTSEL1_RESERVED (~0x1FF)
+
+/* AMD K7 */
+#define MSR_K7_EVNTSEL0 0xC0010000 /* .. 0xC0010003 */
+#define MSR_K7_PERFCTR0 0xC0010004 /* .. 0xC0010007 */
+
+/* Intel P4 */
+#define MSR_P4_MISC_ENABLE 0x1A0
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) /* read-only status bit */
+#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) /* read-only status bit */
+#define MSR_P4_PERFCTR0 0x300 /* .. 0x311 */
+#define MSR_P4_CCCR0 0x360 /* .. 0x371 */
+#define MSR_P4_ESCR0 0x3A0 /* .. 0x3E1, with some gaps */
+
+#define MSR_P4_PEBS_ENABLE 0x3F1
+#define P4_PE_REPLAY_TAG_BITS 0x00000607
+#define P4_PE_UOP_TAG 0x01000000
+#define P4_PE_RESERVED 0xFEFFF9F8 /* only allow ReplayTagging */
+
+#define MSR_P4_PEBS_MATRIX_VERT 0x3F2
+#define P4_PMV_REPLAY_TAG_BITS 0x00000003
+#define P4_PMV_RESERVED 0xFFFFFFFC
+
+#define P4_CCCR_OVF 0x80000000
+#define P4_CCCR_CASCADE 0x40000000
+#define P4_CCCR_OVF_PMI_T1 0x08000000
+#define P4_CCCR_OVF_PMI_T0 0x04000000
+#define P4_CCCR_ACTIVE_THREAD 0x00030000
+#define P4_CCCR_ENABLE 0x00001000
+#define P4_CCCR_ESCR_SELECT(X) (((X) >> 13) & 0x7)
+#define P4_CCCR_RESERVED (0x30000FFF|P4_CCCR_OVF|P4_CCCR_OVF_PMI_T1)
+#define P4_CCCR_REQUIRED P4_CCCR_ACTIVE_THREAD
+
+#define P4_ESCR_CPL_T1 0x00000003
+#define P4_ESCR_CPL_T0 0x0000000C /* must be non-zero */
+#define P4_ESCR_TAG_ENABLE 0x00000010
+#define P4_ESCR_RESERVED (0x80000000|P4_ESCR_CPL_T1)
+
+#define P4_FAST_RDPMC 0x80000000
+#define P4_MASK_FAST_RDPMC 0x0000001F /* we only need low 5 bits */
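+/* On the P4, setting bit 31 of the counter index passed to RDPMC
+   selects the "fast" read mode, which returns only the low 32 bits.
+   pmc_map[] entries may carry this flag, so the plain counter number
+   is recovered by masking with P4_MASK_FAST_RDPMC. */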
+
+#define rdmsrl(msr,low) \
+ __asm__ __volatile__("rdmsr" : "=a"(low) : "c"(msr) : "edx")
+#define rdpmcl(ctr,low) \
+ __asm__ __volatile__("rdpmc" : "=a"(low) : "c"(ctr) : "edx")
+
+static void clear_msr_range(unsigned int base, unsigned int n)
+{
+ unsigned int i;
+
+ for(i = 0; i < n; ++i)
+ wrmsr(base+i, 0, 0);
+}
+
+static inline void set_in_cr4_local(unsigned int mask)
+{
+ write_cr4(read_cr4() | mask);
+}
+
+static inline void clear_in_cr4_local(unsigned int mask)
+{
+ write_cr4(read_cr4() & ~mask);
+}
+
+static unsigned int new_id(void)
+{
+ static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+ static unsigned int counter;
+	unsigned int id;
+
+ spin_lock(&lock);
+ id = ++counter;
+ spin_unlock(&lock);
+ return id;
+}
+
+/****************************************************************
+ * *
+ * Interrupt support via the local APIC. *
+ * *
+ ****************************************************************/
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void perfctr_default_ihandler(unsigned long pc)
+{
+}
+
+static perfctr_ihandler_t perfctr_ihandler = perfctr_default_ihandler;
+
+void do_perfctr_interrupt(struct pt_regs *regs)
+{
+ /* PREEMPT note: invoked via an interrupt gate, which
+ masks interrupts. We're still on the originating CPU. */
+ /* XXX: recursive interrupts? delay the ACK, mask LVTPC, or queue? */
+ ack_APIC_irq();
+ irq_enter();
+ (*perfctr_ihandler)(regs->eip);
+ irq_exit();
+}
+
+void perfctr_cpu_set_ihandler(perfctr_ihandler_t ihandler)
+{
+ perfctr_ihandler = ihandler ? ihandler : perfctr_default_ihandler;
+}
+
+static inline int perfctr_has_ictrs(const struct perfctr_cpu_state *state)
+{
+ return perfctr_cstatus_has_ictrs(state->cstatus);
+}
+
+static inline void perfctr_set_lvtpc(unsigned int vector)
+{
+ if( cpu_has_apic )
+ apic_write(APIC_LVTPC, vector);
+}
+
+#else /* CONFIG_X86_LOCAL_APIC */
+static inline int perfctr_has_ictrs(const struct perfctr_cpu_state *state)
+{
+ return 0;
+}
+static inline void perfctr_set_lvtpc(unsigned int vector) { }
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_SMP /* which implies CONFIG_X86_LOCAL_APIC */
+static inline void set_isuspend_cpu(struct perfctr_cpu_state *state,
+ const struct per_cpu_cache *cpu)
+{
+ state->k2.isuspend_cpu = cpu;
+}
+
+static inline int is_isuspend_cpu(const struct perfctr_cpu_state *state,
+ const struct per_cpu_cache *cpu)
+{
+ return state->k2.isuspend_cpu == cpu;
+}
+
+static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state)
+{
+ state->k2.isuspend_cpu = NULL;
+}
+#else /* CONFIG_SMP */
+static inline void set_isuspend_cpu(struct perfctr_cpu_state *state,
+ const struct per_cpu_cache *cpu) { }
+static inline int is_isuspend_cpu(const struct perfctr_cpu_state *state,
+ const struct per_cpu_cache *cpu) { return 1; }
+static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) { }
+#endif /* CONFIG_SMP */
+
+/****************************************************************
+ * *
+ * Driver procedures. *
+ * *
+ ****************************************************************/
+
+/*
+ * Intel P5 family (Pentium, family code 5).
+ * - One TSC and two 40-bit PMCs.
+ * - A single 32-bit CESR (MSR 0x11) controls both PMCs.
+ * CESR has two halves, each controlling one PMC.
+ * To keep the API reasonably clean, the user puts 16 bits of
+ * control data in each counter's evntsel; the driver combines
+ *   these into a single 32-bit CESR value.
+ * - Overflow interrupts are not available.
+ * - Pentium MMX added the RDPMC instruction. RDPMC has lower
+ * overhead than RDMSR and it can be used in user-mode code.
+ * - The MMX events are not symmetric: some events are only available
+ *   on one of the PMCs, and some event codes denote different events
+ * depending on which PMCs they control.
+ */
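+
+/* Illustrative example (hypothetical evntsel values, not used by the
+   driver): with pmc_map[] the identity, evntsel[0] == 0x00C5, and
+   evntsel[1] == 0x01C6, p5_like_check_control() builds
+	cesr = (0x01C6 << 16) | 0x00C5 == 0x01C600C5,
+   i.e. evntsel[1] lands in the PMC1 half (bits 16-31) and evntsel[0]
+   in the PMC0 half (bits 0-15). */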
+
+/* shared with MII and C6 */
+static int p5_like_check_control(struct perfctr_cpu_state *state,
+ unsigned int reserved_bits, int is_c6)
+{
+ unsigned short cesr_half[2];
+ unsigned int pmc, evntsel, i;
+
+ if( state->control.nrictrs != 0 || state->control.nractrs > 2 )
+ return -EINVAL;
+ cesr_half[0] = 0;
+ cesr_half[1] = 0;
+ for(i = 0; i < state->control.nractrs; ++i) {
+ pmc = state->control.pmc_map[i];
+ if( pmc > 1 || cesr_half[pmc] != 0 )
+ return -EINVAL;
+ evntsel = state->control.evntsel[i];
+ /* protect reserved bits */
+ if( (evntsel & reserved_bits) != 0 )
+ return -EPERM;
+ /* the CPL field (if defined) must be non-zero */
+ if( !is_c6 && !(evntsel & P5_CESR_CPL) )
+ return -EINVAL;
+ cesr_half[pmc] = evntsel;
+ }
+ state->k1.p5_cesr = (cesr_half[1] << 16) | cesr_half[0];
+ return 0;
+}
+
+static int p5_check_control(struct perfctr_cpu_state *state)
+{
+ return p5_like_check_control(state, P5_CESR_RESERVED, 0);
+}
+
+/* shared with MII but not C6 */
+static void p5_write_control(const struct perfctr_cpu_state *state)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int cesr;
+
+ cesr = state->k1.p5_cesr;
+ if( !cesr ) /* no PMC is on (this test doesn't work on C6) */
+ return;
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ if( cpu->k1.p5_cesr != cesr ) {
+ cpu->k1.p5_cesr = cesr;
+ wrmsr(MSR_P5_CESR, cesr, 0);
+ }
+}
+
+static void p5_read_counters(const struct perfctr_cpu_state *state,
+ struct perfctr_low_ctrs *ctrs)
+{
+ unsigned int cstatus, nrctrs, i;
+
+ /* The P5 doesn't allocate a cache line on a write miss, so do
+ a dummy read to avoid a write miss here _and_ a read miss
+ later in our caller. */
+ asm("" : : "r"(ctrs->tsc));
+
+ cstatus = state->cstatus;
+ if( perfctr_cstatus_has_tsc(cstatus) )
+ rdtscl(ctrs->tsc);
+ nrctrs = perfctr_cstatus_nractrs(cstatus);
+ for(i = 0; i < nrctrs; ++i) {
+ unsigned int pmc = state->control.pmc_map[i];
+ rdmsrl(MSR_P5_CTR0+pmc, ctrs->pmc[i]);
+ }
+}
+
+/* used by P5MMX, MII, C6, VC3, P6, K7, P4 */
+static void rdpmc_read_counters(const struct perfctr_cpu_state *state,
+ struct perfctr_low_ctrs *ctrs)
+{
+ unsigned int cstatus, nrctrs, i;
+
+ cstatus = state->cstatus;
+ if( perfctr_cstatus_has_tsc(cstatus) )
+ rdtscl(ctrs->tsc);
+ nrctrs = perfctr_cstatus_nractrs(cstatus);
+ for(i = 0; i < nrctrs; ++i) {
+ unsigned int pmc = state->control.pmc_map[i];
+ rdpmcl(pmc, ctrs->pmc[i]);
+ }
+}
+
+/* shared with MII and C6 */
+static void p5_clear_counters(void)
+{
+ clear_msr_range(MSR_P5_CESR, 1+2);
+}
+
+/*
+ * Cyrix 6x86/MII/III.
+ * - Same MSR assignments as P5 MMX. Has RDPMC and two 48-bit PMCs.
+ * - Event codes and CESR formatting as in the plain P5 subset.
+ * - Many but not all P5 MMX event codes are implemented.
+ * - Cyrix adds a few more event codes. The event code is widened
+ * to 7 bits, and Cyrix puts the high bit in CESR bit 10
+ * (and CESR bit 26 for PMC1).
+ */
+
+static int mii_check_control(struct perfctr_cpu_state *state)
+{
+ return p5_like_check_control(state, MII_CESR_RESERVED, 0);
+}
+
+/*
+ * Centaur WinChip C6/2/3.
+ * - Same MSR assignments as P5 MMX. Has RDPMC and two 40-bit PMCs.
+ * - CESR is formatted with two halves, like P5. However, there
+ * are no defined control fields for e.g. CPL selection, and
+ * there is no defined method for stopping the counters.
+ * - Only a few event codes are defined.
+ * - The 64-bit TSC is synthesised from the low 32 bits of the
+ * two PMCs, and CESR has to be set up appropriately.
+ * Reprogramming CESR causes RDTSC to yield invalid results.
+ * (The C6 may also hang in this case, due to C6 erratum I-13.)
+ * Therefore, using the PMCs on any of these processors requires
+ * that the TSC is not accessed at all:
+ * 1. The kernel must be configured for a TSC-less processor, i.e.
+ * generic 586 or less.
+ * 2. The "notsc" boot parameter must be passed to the kernel.
+ * 3. User-space libraries and code must also be configured and
+ * compiled for a generic 586 or less.
+ */
+
+#if !defined(CONFIG_X86_TSC)
+static int c6_check_control(struct perfctr_cpu_state *state)
+{
+ if( state->control.tsc_on )
+ return -EINVAL;
+ return p5_like_check_control(state, C6_CESR_RESERVED, 1);
+}
+
+static void c6_write_control(const struct perfctr_cpu_state *state)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int cesr;
+
+ if( perfctr_cstatus_nractrs(state->cstatus) == 0 ) /* no PMC is on */
+ return;
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ cesr = state->k1.p5_cesr;
+ if( cpu->k1.p5_cesr != cesr ) {
+ cpu->k1.p5_cesr = cesr;
+ wrmsr(MSR_P5_CESR, cesr, 0);
+ }
+}
+#endif
+
+/*
+ * Intel P6 family (Pentium Pro, Pentium II, and Pentium III cores,
+ * and Xeon and Celeron versions of Pentium II and III cores).
+ * - One TSC and two 40-bit PMCs.
+ * - One 32-bit EVNTSEL MSR for each PMC.
+ * - EVNTSEL0 contains a global enable/disable bit.
+ * That bit is reserved in EVNTSEL1.
+ * - Each EVNTSEL contains a CPL field.
+ * - Overflow interrupts are possible, but require that the
+ * local APIC is available. Some Mobile P6s have no local APIC.
+ * - The PMCs cannot be initialised with arbitrary values, since
+ * wrmsr fills the high bits by sign-extending from bit 31.
+ * - Most events are symmetric, but a few are not.
+ */
+
+/* shared with K7 */
+static int p6_like_check_control(struct perfctr_cpu_state *state, int is_k7)
+{
+ unsigned int evntsel, i, nractrs, nrctrs, pmc_mask, pmc;
+
+ nractrs = state->control.nractrs;
+ nrctrs = nractrs + state->control.nrictrs;
+ if( nrctrs < nractrs || nrctrs > (is_k7 ? 4 : 2) )
+ return -EINVAL;
+
+ pmc_mask = 0;
+ for(i = 0; i < nrctrs; ++i) {
+ pmc = state->control.pmc_map[i];
+ if( pmc >= (is_k7 ? 4 : 2) || (pmc_mask & (1<<pmc)) )
+ return -EINVAL;
+ pmc_mask |= (1<<pmc);
+ evntsel = state->control.evntsel[i];
+ /* protect reserved bits */
+ if( evntsel & P6_EVNTSEL_RESERVED )
+ return -EPERM;
+ /* check ENable bit */
+ if( is_k7 ) {
+ /* ENable bit must be set in each evntsel */
+ if( !(evntsel & P6_EVNTSEL_ENABLE) )
+ return -EINVAL;
+ } else {
+ /* only evntsel[0] has the ENable bit */
+ if( evntsel & P6_EVNTSEL_ENABLE ) {
+ if( pmc > 0 )
+ return -EPERM;
+ } else {
+ if( pmc == 0 )
+ return -EINVAL;
+ }
+ }
+ /* the CPL field must be non-zero */
+ if( !(evntsel & P6_EVNTSEL_CPL) )
+ return -EINVAL;
+ /* INT bit must be off for a-mode and on for i-mode counters */
+ if( evntsel & P6_EVNTSEL_INT ) {
+ if( i < nractrs )
+ return -EINVAL;
+ } else {
+ if( i >= nractrs )
+ return -EINVAL;
+ }
+ }
+ state->k1.id = new_id();
+ return 0;
+}
+
+static int p6_check_control(struct perfctr_cpu_state *state)
+{
+ return p6_like_check_control(state, 0);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */
+/* shared with K7 */
+static void p6_like_isuspend(struct perfctr_cpu_state *state,
+ unsigned int msr_evntsel0)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int cstatus, nrctrs, i;
+
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ cstatus = state->cstatus;
+ nrctrs = perfctr_cstatus_nrctrs(cstatus);
+ for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) {
+ unsigned int pmc, now;
+ pmc = state->control.pmc_map[i];
+ cpu->control.evntsel[pmc] = 0;
+ wrmsr(msr_evntsel0+pmc, 0, 0);
+ rdpmcl(pmc, now);
+ state->sum.pmc[i] += now - state->start.pmc[i];
+ state->start.pmc[i] = now;
+ }
+ /* cpu->k1.id is still == state->k1.id */
+ set_isuspend_cpu(state, cpu);
+}
+
+/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */
+/* shared with K7 */
+static void p6_like_iresume(const struct perfctr_cpu_state *state,
+ unsigned int msr_perfctr0)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int cstatus, nrctrs, i;
+
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ if( cpu->k1.id == state->k1.id ) {
+ cpu->k1.id = 0; /* force reload of cleared EVNTSELs */
+ if( is_isuspend_cpu(state, cpu) )
+ return; /* skip reload of PERFCTRs */
+ }
+ cstatus = state->cstatus;
+ nrctrs = perfctr_cstatus_nrctrs(cstatus);
+ for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) {
+ unsigned int pmc = state->control.pmc_map[i];
+ /* the -1 is correct for K7 but ignored by P6 */
+ wrmsr(msr_perfctr0+pmc, state->start.pmc[i], -1);
+ }
+ /* cpu->k1.id remains != state->k1.id */
+}
+
+static void p6_isuspend(struct perfctr_cpu_state *state)
+{
+ p6_like_isuspend(state, MSR_P6_EVNTSEL0);
+}
+
+static void p6_iresume(const struct perfctr_cpu_state *state)
+{
+ p6_like_iresume(state, MSR_P6_PERFCTR0);
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+/* shared with K7 and VC3 */
+static void p6_like_write_control(const struct perfctr_cpu_state *state,
+ unsigned int msr_evntsel0)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int nrctrs, i;
+
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ if( cpu->k1.id == state->k1.id )
+ return;
+ nrctrs = perfctr_cstatus_nrctrs(state->cstatus);
+ for(i = 0; i < nrctrs; ++i) {
+ unsigned int evntsel = state->control.evntsel[i];
+ unsigned int pmc = state->control.pmc_map[i];
+ if( evntsel != cpu->control.evntsel[pmc] ) {
+ cpu->control.evntsel[pmc] = evntsel;
+ wrmsr(msr_evntsel0+pmc, evntsel, 0);
+ }
+ }
+ cpu->k1.id = state->k1.id;
+}
+
+/* shared with VC3 */
+static void p6_write_control(const struct perfctr_cpu_state *state)
+{
+ p6_like_write_control(state, MSR_P6_EVNTSEL0);
+}
+
+static void p6_clear_counters(void)
+{
+ clear_msr_range(MSR_P6_EVNTSEL0, 2);
+ clear_msr_range(MSR_P6_PERFCTR0, 2);
+}
+
+/*
+ * AMD K7 family (Athlon, Duron).
+ * - Somewhat similar to the Intel P6 family.
+ * - Four 48-bit PMCs.
+ * - Four 32-bit EVNTSEL MSRs with similar layout as in P6.
+ * - Completely different MSR assignments :-(
+ * - Fewer countable events defined :-(
+ * - The events appear to be completely symmetric.
+ * - The EVNTSEL MSRs are symmetric since each has its own enable bit.
+ * - Publicly available documentation is incomplete.
+ */
+
+static int k7_check_control(struct perfctr_cpu_state *state)
+{
+ return p6_like_check_control(state, 1);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void k7_isuspend(struct perfctr_cpu_state *state)
+{
+ p6_like_isuspend(state, MSR_K7_EVNTSEL0);
+}
+
+static void k7_iresume(const struct perfctr_cpu_state *state)
+{
+ p6_like_iresume(state, MSR_K7_PERFCTR0);
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+static void k7_write_control(const struct perfctr_cpu_state *state)
+{
+ p6_like_write_control(state, MSR_K7_EVNTSEL0);
+}
+
+static void k7_clear_counters(void)
+{
+ clear_msr_range(MSR_K7_EVNTSEL0, 4+4);
+}
+
+/*
+ * VIA C3 family.
+ * - A Centaur design somewhat similar to the P6/Celeron.
+ * - PERFCTR0 is an alias for the TSC, and EVNTSEL0 is read-only.
+ * - PERFCTR1 is 32 bits wide.
+ * - EVNTSEL1 has no defined control fields, and there is no
+ * defined method for stopping the counter.
+ * - According to testing, the reserved fields in EVNTSEL1 have
+ * no function. We always fill them with zeroes.
+ * - Only a few event codes are defined.
+ * - No local APIC or interrupt-mode support.
+ * - pmc_map[0] must be 1, if nractrs == 1.
+ */
+static int vc3_check_control(struct perfctr_cpu_state *state)
+{
+ if( state->control.nrictrs || state->control.nractrs > 1 )
+ return -EINVAL;
+ if( state->control.nractrs == 1 ) {
+ if( state->control.pmc_map[0] != 1 )
+ return -EINVAL;
+ if( state->control.evntsel[0] & VC3_EVNTSEL1_RESERVED )
+ return -EPERM;
+ state->k1.id = state->control.evntsel[0];
+ } else
+ state->k1.id = 0;
+ return 0;
+}
+
+static void vc3_clear_counters(void)
+{
+ /* Not documented, but seems to be default after boot. */
+ wrmsr(MSR_P6_EVNTSEL0+1, 0x00070079, 0);
+}
+
+/*
+ * Intel Pentium 4.
+ * Current implementation restrictions:
+ * - No DS/PEBS support.
+ */
+
+/*
+ * Table 15-4 in the IA32 Volume 3 manual contains an 18x8 entry mapping
+ * from counter/CCCR number (0-17) and ESCR SELECT value (0-7) to the
+ * actual ESCR MSR number. This mapping contains some repeated patterns,
+ * so we can compact it to a 4x8 table of MSR offsets:
+ *
+ * 1. CCCRs 16 and 17 are mapped just like CCCRs 13 and 14, respectively.
+ * Thus, we only consider the 16 CCCRs 0-15.
+ * 2. The CCCRs are organised in pairs, and both CCCRs in a pair use the
+ * same mapping. Thus, we only consider the 8 pairs 0-7.
+ * 3. In each pair of pairs, the second odd-numbered pair has the same domain
+ *    as the first even-numbered pair, and the range is 1 + the range of
+ *    the first even-numbered pair. For example, CCCR(0) and (1) map ESCR
+ * SELECT(7) to 0x3A0, and CCCR(2) and (3) map it to 0x3A1.
+ * The only exception is that pair (7) [CCCRs 14 and 15] does not have
+ * ESCR SELECT(3) in its domain, like pair (6) [CCCRs 12 and 13] has.
+ * NOTE: Revisions of IA32 Volume 3 older than #245472-007 had an error
+ * in this table: CCCRs 12, 13, and 16 had their mappings for ESCR SELECT
+ * values 2 and 3 swapped.
+ * 4. All MSR numbers are of the form 0x3??. Instead of storing these as
+ * 16-bit numbers, the table only stores the 8-bit offsets from 0x300.
+ */
+
+static const unsigned char p4_cccr_escr_map[4][8] = {
+ /* 0x00 and 0x01 as is, 0x02 and 0x03 are +1 */
+ [0x00/4] { [7] 0xA0,
+ [6] 0xA2,
+ [2] 0xAA,
+ [4] 0xAC,
+ [0] 0xB2,
+ [1] 0xB4,
+ [3] 0xB6,
+ [5] 0xC8, },
+ /* 0x04 and 0x05 as is, 0x06 and 0x07 are +1 */
+ [0x04/4] { [0] 0xC0,
+ [2] 0xC2,
+ [1] 0xC4, },
+ /* 0x08 and 0x09 as is, 0x0A and 0x0B are +1 */
+ [0x08/4] { [1] 0xA4,
+ [0] 0xA6,
+ [5] 0xA8,
+ [2] 0xAE,
+ [3] 0xB0, },
+ /* 0x0C, 0x0D, and 0x10 as is,
+ 0x0E, 0x0F, and 0x11 are +1 except [3] is not in the domain */
+ [0x0C/4] { [4] 0xB8,
+ [5] 0xCC,
+ [6] 0xE0,
+ [0] 0xBA,
+ [2] 0xBC,
+ [3] 0xBE,
+ [1] 0xCA, },
+};
+
+static unsigned int p4_escr_addr(unsigned int pmc, unsigned int cccr_val)
+{
+ unsigned int escr_select, pair, escr_offset;
+
+ escr_select = P4_CCCR_ESCR_SELECT(cccr_val);
+ if( pmc > 0x11 )
+ return 0; /* pmc range error */
+ if( pmc > 0x0F )
+ pmc -= 3; /* 0 <= pmc <= 0x0F */
+ pair = pmc / 2; /* 0 <= pair <= 7 */
+ escr_offset = p4_cccr_escr_map[pair / 2][escr_select];
+ if( !escr_offset || (pair == 7 && escr_select == 3) )
+ return 0; /* ESCR SELECT range error */
+ return escr_offset + (pair & 1) + 0x300;
+}
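+
+/* Worked example (for illustration only): CCCR 6 with ESCR SELECT 2
+   gives pair = 3, so the table row is p4_cccr_escr_map[1], whose
+   entry [2] is 0xC2; pair is odd, so the ESCR address is
+   0x300 + 0xC2 + 1 == 0x3C3. */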
+
+static int p4_check_control(struct perfctr_cpu_state *state)
+{
+ unsigned int i, nractrs, nrctrs, pmc_mask;
+
+ nractrs = state->control.nractrs;
+ nrctrs = nractrs + state->control.nrictrs;
+ if( nrctrs < nractrs || nrctrs > 18 )
+ return -EINVAL;
+
+ pmc_mask = 0;
+ for(i = 0; i < nrctrs; ++i) {
+ unsigned int pmc, cccr_val, escr_val, escr_addr;
+ /* check that pmc_map[] is well-defined;
+ pmc_map[i] is what we pass to RDPMC, the PMC itself
+ is extracted by masking off the FAST_RDPMC flag */
+ pmc = state->control.pmc_map[i] & ~P4_FAST_RDPMC;
+ if( pmc >= 18 || (pmc_mask & (1<<pmc)) )
+ return -EINVAL;
+ pmc_mask |= (1<<pmc);
+ /* check CCCR contents */
+ cccr_val = state->control.evntsel[i];
+ if( cccr_val & P4_CCCR_RESERVED )
+ return -EPERM;
+ if( (cccr_val & P4_CCCR_REQUIRED) != P4_CCCR_REQUIRED )
+ return -EINVAL;
+ if( !(cccr_val & (P4_CCCR_ENABLE | P4_CCCR_CASCADE)) )
+ return -EINVAL;
+ if( cccr_val & P4_CCCR_OVF_PMI_T0 ) {
+ if( i < nractrs )
+ return -EINVAL;
+ } else {
+ if( i >= nractrs )
+ return -EINVAL;
+ }
+ /* check ESCR contents */
+ escr_val = state->control.evntsel_aux[i];
+ if( escr_val & P4_ESCR_RESERVED )
+ return -EPERM;
+ if( !(escr_val & P4_ESCR_CPL_T0) )
+ return -EINVAL;
+ /* compute and cache ESCR address */
+ escr_addr = p4_escr_addr(pmc, cccr_val);
+ if( !escr_addr )
+ return -EINVAL; /* ESCR SELECT range error */
+ /* XXX: Two counters could map to the same ESCR. Should we
+ check that they use the same ESCR value? */
+ state->k2.p4_escr_map[i] = escr_addr - MSR_P4_ESCR0;
+ }
+ /* check ReplayTagging control (PEBS_ENABLE and PEBS_MATRIX_VERT) */
+ if( state->control.p4.pebs_enable ) {
+ if( !nrctrs )
+ return -EPERM;
+ if( state->control.p4.pebs_enable & P4_PE_RESERVED )
+ return -EPERM;
+		if( !(state->control.p4.pebs_enable & P4_PE_UOP_TAG) )
+ return -EINVAL;
+ if( !(state->control.p4.pebs_enable & P4_PE_REPLAY_TAG_BITS) )
+ return -EINVAL;
+ if( state->control.p4.pebs_matrix_vert & P4_PMV_RESERVED )
+ return -EPERM;
+ if( !(state->control.p4.pebs_matrix_vert & P4_PMV_REPLAY_TAG_BITS) )
+ return -EINVAL;
+ } else if( state->control.p4.pebs_matrix_vert )
+ return -EPERM;
+ state->k1.id = new_id();
+ return 0;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */
+/* XXX: merge with p6_like_isuspend() later */
+static void p4_isuspend(struct perfctr_cpu_state *state)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int cstatus, nrctrs, i;
+
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ cstatus = state->cstatus;
+ nrctrs = perfctr_cstatus_nrctrs(cstatus);
+ for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) {
+ unsigned int pmc_raw, pmc_idx, now;
+ pmc_raw = state->control.pmc_map[i];
+ pmc_idx = pmc_raw & P4_MASK_FAST_RDPMC;
+ cpu->control.evntsel[pmc_idx] = 0;
+ wrmsr(MSR_P4_CCCR0+pmc_idx, 0, 0); /* P4 quirk: also clear OVF */
+ rdpmcl(pmc_raw, now);
+ state->sum.pmc[i] += now - state->start.pmc[i];
+ state->start.pmc[i] = now;
+ }
+ /* cpu->k1.id is still == state->k1.id */
+ set_isuspend_cpu(state, cpu);
+ /* another P4 quirk: must unmask LVTPC */
+ apic_write(APIC_LVTPC, LOCAL_PERFCTR_VECTOR);
+}
+
+/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */
+/* XXX: merge with p6_like_iresume() later */
+static void p4_iresume(const struct perfctr_cpu_state *state)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int cstatus, nrctrs, i;
+
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ if( cpu->k1.id == state->k1.id ) {
+ cpu->k1.id = 0; /* force reload of cleared EVNTSELs */
+ if( is_isuspend_cpu(state, cpu) )
+ return; /* skip reload of PERFCTRs */
+ }
+ cstatus = state->cstatus;
+ nrctrs = perfctr_cstatus_nrctrs(cstatus);
+ for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) {
+ unsigned int pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC;
+ wrmsr(MSR_P4_PERFCTR0+pmc, state->start.pmc[i], -1);
+ }
+ /* cpu->k1.id remains != state->k1.id */
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+static void p4_write_control(const struct perfctr_cpu_state *state)
+{
+ struct per_cpu_cache *cpu;
+ unsigned int nrctrs, i;
+
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ if( cpu->k1.id == state->k1.id )
+ return;
+ nrctrs = perfctr_cstatus_nrctrs(state->cstatus);
+ for(i = 0; i < nrctrs; ++i) {
+ unsigned int escr_val, escr_off, cccr_val, pmc;
+ escr_val = state->control.evntsel_aux[i];
+ escr_off = state->k2.p4_escr_map[i];
+ if( escr_val != cpu->control.escr[escr_off] ) {
+ cpu->control.escr[escr_off] = escr_val;
+ wrmsr(MSR_P4_ESCR0+escr_off, escr_val, 0);
+ }
+ cccr_val = state->control.evntsel[i];
+ pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC;
+ if( cccr_val != cpu->control.evntsel[pmc] ) {
+ cpu->control.evntsel[pmc] = cccr_val;
+ wrmsr(MSR_P4_CCCR0+pmc, cccr_val, 0);
+ }
+ }
+ if( state->control.p4.pebs_enable != cpu->control.pebs_enable ) {
+ cpu->control.pebs_enable = state->control.p4.pebs_enable;
+ wrmsr(MSR_P4_PEBS_ENABLE, cpu->control.pebs_enable, 0);
+ }
+ if( state->control.p4.pebs_matrix_vert != cpu->control.pebs_matrix_vert ) {
+ cpu->control.pebs_matrix_vert = state->control.p4.pebs_matrix_vert;
+ wrmsr(MSR_P4_PEBS_MATRIX_VERT, cpu->control.pebs_matrix_vert, 0);
+ }
+ cpu->k1.id = state->k1.id;
+}
+
+static void p4_clear_counters(void)
+{
+	/* MSR 0x3F0 seems to have a default value of 0xFC00, but the
+	   current docs don't fully define it, so leave it alone for now. */
+ /* clear PEBS_ENABLE and PEBS_MATRIX_VERT; they handle both PEBS
+ and ReplayTagging, and should exist even if PEBS is disabled */
+ clear_msr_range(0x3F1, 2);
+ clear_msr_range(0x3A0, 31);
+ clear_msr_range(0x3C0, 6);
+ clear_msr_range(0x3C8, 6);
+ clear_msr_range(0x3E0, 2);
+ clear_msr_range(MSR_P4_CCCR0, 18);
+ clear_msr_range(MSR_P4_PERFCTR0, 18);
+}
+
+/*
+ * Generic driver for any x86 with a working TSC.
+ */
+
+static int generic_check_control(struct perfctr_cpu_state *state)
+{
+ if( state->control.nractrs || state->control.nrictrs )
+ return -EINVAL;
+ return 0;
+}
+
+static void generic_write_control(const struct perfctr_cpu_state *state)
+{
+}
+
+static void generic_read_counters(const struct perfctr_cpu_state *state,
+ struct perfctr_low_ctrs *ctrs)
+{
+ rdtscl(ctrs->tsc);
+}
+
+static void generic_clear_counters(void)
+{
+}
+
+/*
+ * Driver methods, internal and exported.
+ *
+ * Frequently called functions (write_control, read_counters,
+ * isuspend and iresume) are back-patched to invoke the correct
+ * processor-specific methods directly, thereby saving the
+ * overheads of indirect function calls.
+ *
+ * Backpatchable call sites must have been "finalised" by the time
+ * initialisation completes. The reason is that unsynchronised code
+ * modification doesn't work in multiprocessor systems, due to
+ * Intel P6 errata. Consequently, all backpatchable call sites
+ * must be known and local to this file.
+ */
+
+static int redirect_call_disable;
+
+static void redirect_call(void *ra, void *to)
+{
+ /* XXX: make this function __init later */
+ if( redirect_call_disable )
+ printk(KERN_ERR __FILE__ ":%s: unresolved call to %p at %p\n",
+ __FUNCTION__, to, ra);
+ /* we can only redirect `call near relative' instructions */
+ if( *((unsigned char*)ra - 5) != 0xE8 ) {
+ printk(KERN_WARNING __FILE__ ":%s: unable to redirect caller %p to %p\n",
+ __FUNCTION__, ra, to);
+ return;
+ }
+ *(int*)((char*)ra - 4) = (char*)to - (char*)ra;
+}
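+
+/* For reference: a `call near relative' is the 5-byte sequence
+   E8 <rel32>, where <rel32> is the displacement from the address of
+   the *following* instruction -- which is exactly the return address
+   `ra' seen here. Writing `to - ra' into the 4 bytes before `ra'
+   therefore retargets the call to `to'. */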
+
+static void (*write_control)(const struct perfctr_cpu_state*);
+static void perfctr_cpu_write_control(const struct perfctr_cpu_state *state)
+{
+ redirect_call(__builtin_return_address(0), write_control);
+ return write_control(state);
+}
+
+static void (*read_counters)(const struct perfctr_cpu_state*,
+ struct perfctr_low_ctrs*);
+static void perfctr_cpu_read_counters(const struct perfctr_cpu_state *state,
+ struct perfctr_low_ctrs *ctrs)
+{
+ redirect_call(__builtin_return_address(0), read_counters);
+ return read_counters(state, ctrs);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void (*cpu_isuspend)(struct perfctr_cpu_state*);
+static void perfctr_cpu_isuspend(struct perfctr_cpu_state *state)
+{
+ redirect_call(__builtin_return_address(0), cpu_isuspend);
+ return cpu_isuspend(state);
+}
+
+static void (*cpu_iresume)(const struct perfctr_cpu_state*);
+static void perfctr_cpu_iresume(const struct perfctr_cpu_state *state)
+{
+ redirect_call(__builtin_return_address(0), cpu_iresume);
+ return cpu_iresume(state);
+}
+
+/* Call perfctr_cpu_ireload() just before perfctr_cpu_resume() to
+   bypass internal caching and force a reload of the i-mode PMCs. */
+void perfctr_cpu_ireload(struct perfctr_cpu_state *state)
+{
+#if !defined(CONFIG_SMP)
+ /* This works because we know the process owning 'state'
+ * will be resumed on the current CPU.
+ */
+ per_cpu(per_cpu_cache, smp_processor_id()).k1.id = 0;
+#else
+ /* This works because of the additional is_isuspend_cpu() check
+ * that iresume() performs on SMP. The UP version above fails
+ * when iresume() is issued via remote control on SMP, since we
+ * don't know on which CPU the process owning 'state' will be
+ * resumed. Inferior alternatives are:
+ * 1. Allocate a new control id to 'state', but that is ugly.
+ * 2. Invalidate cpu->k1.id on every CPU, but this would penalise
+ * other processes, require a locking protocol for cpu->k1.id
+ * accesses, or require an expensive smp_call_function() call
+ * to perform the update atomically on all CPUs.
+ */
+ clear_isuspend_cpu(state);
+#endif
+}
+
+/* PRE: the counters have been suspended and sampled by perfctr_cpu_suspend() */
+unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state *state)
+{
+ unsigned int cstatus, nrctrs, pmc, pmc_mask;
+
+ cstatus = state->cstatus;
+ pmc = perfctr_cstatus_nractrs(cstatus);
+ nrctrs = perfctr_cstatus_nrctrs(cstatus);
+
+ for(pmc_mask = 0; pmc < nrctrs; ++pmc) {
+ if( (int)state->start.pmc[pmc] >= 0 ) { /* XXX: ">" ? */
+ /* XXX: "+=" to correct for overshots */
+ state->start.pmc[pmc] = state->control.ireset[pmc];
+ pmc_mask |= (1 << pmc);
+ /* On a P4 we should now clear the OVF flag in the
+ counter's CCCR. However, p4_isuspend() already
+ did that as a side-effect of clearing the CCCR
+ in order to stop the i-mode counters. */
+ }
+ }
+ return pmc_mask;
+}
+#else /* CONFIG_X86_LOCAL_APIC */
+static inline void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) { }
+static inline void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) { }
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+static inline int check_ireset(const struct perfctr_cpu_state *state)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ unsigned int nrctrs, i;
+
+ i = state->control.nractrs;
+ nrctrs = i + state->control.nrictrs;
+ for(; i < nrctrs; ++i)
+ if( state->control.ireset[i] >= 0 )
+ return -EINVAL;
+#endif
+ return 0;
+}
+
+static inline void setup_imode_start_values(struct perfctr_cpu_state *state)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ unsigned int cstatus, nrctrs, i;
+
+ cstatus = state->cstatus;
+ nrctrs = perfctr_cstatus_nrctrs(cstatus);
+ for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i)
+ state->start.pmc[i] = state->control.ireset[i];
+#endif
+}
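+
+/* An i-mode counter meant to interrupt after N events is given
+   ireset == -N; counting up from -N it overflows after exactly N
+   increments, which raises the overflow interrupt. */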
+
+static int (*check_control)(struct perfctr_cpu_state*);
+int perfctr_cpu_update_control(struct perfctr_cpu_state *state)
+{
+ int err;
+
+ clear_isuspend_cpu(state);
+ state->cstatus = 0;
+
+ /* disallow i-mode counters if we cannot catch the interrupts */
+ if( !(perfctr_cpu_info.features & PERFCTR_FEATURE_PCINT)
+ && state->control.nrictrs )
+ return -EPERM;
+
+ err = check_control(state);
+ if( err < 0 )
+ return err;
+ err = check_ireset(state);
+ if( err < 0 )
+ return err;
+ state->cstatus = perfctr_mk_cstatus(state->control.tsc_on,
+ state->control.nractrs,
+ state->control.nrictrs);
+ setup_imode_start_values(state);
+ return 0;
+}
+
+void perfctr_cpu_suspend(struct perfctr_cpu_state *state)
+{
+ unsigned int i, cstatus, nractrs;
+ struct perfctr_low_ctrs now;
+
+ if( perfctr_has_ictrs(state) )
+ perfctr_cpu_isuspend(state);
+ perfctr_cpu_read_counters(state, &now);
+ cstatus = state->cstatus;
+ if( perfctr_cstatus_has_tsc(cstatus) )
+ state->sum.tsc += now.tsc - state->start.tsc;
+ nractrs = perfctr_cstatus_nractrs(cstatus);
+ for(i = 0; i < nractrs; ++i)
+ state->sum.pmc[i] += now.pmc[i] - state->start.pmc[i];
+}
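+
+/* NOTE: the unsigned 32-bit subtractions `now - start' above (and in
+   perfctr_cpu_sample() below) yield the correct delta even if a
+   counter wrapped past 2^32 in between, as long as fewer than 2^32
+   events occurred between the two reads. */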
+
+void perfctr_cpu_resume(struct perfctr_cpu_state *state)
+{
+ if( perfctr_has_ictrs(state) )
+ perfctr_cpu_iresume(state);
+ perfctr_cpu_write_control(state);
+ perfctr_cpu_read_counters(state, &state->start);
+ /* XXX: if (SMP && start.tsc == now.tsc) ++now.tsc; */
+}
+
+void perfctr_cpu_sample(struct perfctr_cpu_state *state)
+{
+ unsigned int i, cstatus, nractrs;
+ struct perfctr_low_ctrs now;
+
+ perfctr_cpu_read_counters(state, &now);
+ cstatus = state->cstatus;
+ if( perfctr_cstatus_has_tsc(cstatus) ) {
+ state->sum.tsc += now.tsc - state->start.tsc;
+ state->start.tsc = now.tsc;
+ }
+ nractrs = perfctr_cstatus_nractrs(cstatus);
+ for(i = 0; i < nractrs; ++i) {
+ state->sum.pmc[i] += now.pmc[i] - state->start.pmc[i];
+ state->start.pmc[i] = now.pmc[i];
+ }
+}
+
+static void (*clear_counters)(void);
+static void perfctr_cpu_clear_counters(void)
+{
+ return clear_counters();
+}
+
+/****************************************************************
+ * *
+ * Processor detection and initialisation procedures. *
+ * *
+ ****************************************************************/
+
+/* see comment above at redirect_call() */
+static void __init finalise_backpatching(void)
+{
+ struct per_cpu_cache *cpu;
+ struct perfctr_cpu_state state;
+
+ cpu = &per_cpu(per_cpu_cache, smp_processor_id());
+ memset(cpu, 0, sizeof *cpu);
+ memset(&state, 0, sizeof state);
+ state.cstatus =
+ (perfctr_cpu_info.features & PERFCTR_FEATURE_PCINT)
+ ? perfctr_mk_cstatus(0, 0, 1)
+ : 0;
+ perfctr_cpu_sample(&state);
+ perfctr_cpu_resume(&state);
+ perfctr_cpu_suspend(&state);
+
+ redirect_call_disable = 1;
+}
+
+static int __init intel_init(void)
+{
+ unsigned int misc_enable;
+
+ if( !cpu_has_tsc )
+ return -ENODEV;
+ switch( current_cpu_data.x86 ) {
+ case 5:
+ if( cpu_has_mmx ) {
+ perfctr_cpu_info.type = PERFCTR_X86_INTEL_P5MMX;
+ read_counters = rdpmc_read_counters;
+
+ /* Avoid Pentium Erratum 74. */
+ if( current_cpu_data.x86_model == 4 &&
+ (current_cpu_data.x86_mask == 4 ||
+ (current_cpu_data.x86_mask == 3 &&
+ ((cpuid_eax(1) >> 12) & 0x3) == 1)) )
+ perfctr_cpu_info.features &= ~PERFCTR_FEATURE_RDPMC;
+ } else {
+ perfctr_cpu_info.type = PERFCTR_X86_INTEL_P5;
+ perfctr_cpu_info.features &= ~PERFCTR_FEATURE_RDPMC;
+ read_counters = p5_read_counters;
+ }
+ write_control = p5_write_control;
+ check_control = p5_check_control;
+ clear_counters = p5_clear_counters;
+ return 0;
+ case 6:
+ if( current_cpu_data.x86_model >= 7 ) /* PIII */
+ perfctr_cpu_info.type = PERFCTR_X86_INTEL_PIII;
+ else if( current_cpu_data.x86_model >= 3 ) /* PII or Celeron */
+ perfctr_cpu_info.type = PERFCTR_X86_INTEL_PII;
+ else {
+ perfctr_cpu_info.type = PERFCTR_X86_INTEL_P6;
+
+ /* Avoid Pentium Pro Erratum 26. */
+ if( current_cpu_data.x86_mask < 9 )
+ perfctr_cpu_info.features &= ~PERFCTR_FEATURE_RDPMC;
+ }
+ read_counters = rdpmc_read_counters;
+ write_control = p6_write_control;
+ check_control = p6_check_control;
+ clear_counters = p6_clear_counters;
+#ifdef CONFIG_X86_LOCAL_APIC
+ if( cpu_has_apic ) {
+ perfctr_cpu_info.features |= PERFCTR_FEATURE_PCINT;
+ cpu_isuspend = p6_isuspend;
+ cpu_iresume = p6_iresume;
+ }
+#endif
+ return 0;
+ case 15: /* Pentium 4 */
+ rdmsrl(MSR_P4_MISC_ENABLE, misc_enable);
+ if( !(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL) )
+ break;
+ if( current_cpu_data.x86_model >= 2 )
+ /* Model 2 changed the ESCR Event Mask programming
+ details for several events. */
+ perfctr_cpu_info.type = PERFCTR_X86_INTEL_P4M2;
+ else
+ perfctr_cpu_info.type = PERFCTR_X86_INTEL_P4;
+ read_counters = rdpmc_read_counters;
+ write_control = p4_write_control;
+ check_control = p4_check_control;
+ clear_counters = p4_clear_counters;
+#ifdef CONFIG_X86_LOCAL_APIC
+ if( cpu_has_apic ) {
+ perfctr_cpu_info.features |= PERFCTR_FEATURE_PCINT;
+ cpu_isuspend = p4_isuspend;
+ cpu_iresume = p4_iresume;
+ }
+#endif
+ return 0;
+ }
+ return -ENODEV;
+}
+
+static int __init amd_init(void)
+{
+ if( !cpu_has_tsc )
+ return -ENODEV;
+ switch( current_cpu_data.x86 ) {
+ case 6: /* K7. Model 1 does not have a local APIC.
+ AMD Document #22007 Revision J hints that APIC-less
+ K7s signal overflows as debug interrupts. */
+ perfctr_cpu_info.type = PERFCTR_X86_AMD_K7;
+ read_counters = rdpmc_read_counters;
+ write_control = k7_write_control;
+ check_control = k7_check_control;
+ clear_counters = k7_clear_counters;
+#ifdef CONFIG_X86_LOCAL_APIC
+ if( cpu_has_apic ) {
+ perfctr_cpu_info.features |= PERFCTR_FEATURE_PCINT;
+ cpu_isuspend = k7_isuspend;
+ cpu_iresume = k7_iresume;
+ }
+#endif
+ return 0;
+ }
+ return -ENODEV;
+}
+
+static int __init cyrix_init(void)
+{
+ if( !cpu_has_tsc )
+ return -ENODEV;
+ switch( current_cpu_data.x86 ) {
+ case 6: /* 6x86MX, MII, or III */
+ perfctr_cpu_info.type = PERFCTR_X86_CYRIX_MII;
+ read_counters = rdpmc_read_counters;
+ write_control = p5_write_control;
+ check_control = mii_check_control;
+ clear_counters = p5_clear_counters;
+ return 0;
+ }
+ return -ENODEV;
+}
+
+static int __init centaur_init(void)
+{
+ switch( current_cpu_data.x86 ) {
+#if !defined(CONFIG_X86_TSC)
+ case 5:
+ switch( current_cpu_data.x86_model ) {
+ case 4: /* WinChip C6 */
+ perfctr_cpu_info.type = PERFCTR_X86_WINCHIP_C6;
+ break;
+ case 8: /* WinChip 2, 2A, or 2B */
+ case 9: /* WinChip 3, a 2A with larger cache and lower voltage */
+ perfctr_cpu_info.type = PERFCTR_X86_WINCHIP_2;
+ break;
+ default:
+ return -ENODEV;
+ }
+ /*
+ * TSC must be inaccessible for perfctrs to work.
+ */
+ if( !(read_cr4() & X86_CR4_TSD) || cpu_has_tsc )
+ return -ENODEV;
+ perfctr_cpu_info.features &= ~PERFCTR_FEATURE_RDTSC;
+ read_counters = rdpmc_read_counters;
+ write_control = c6_write_control;
+ check_control = c6_check_control;
+ clear_counters = p5_clear_counters;
+ return 0;
+#endif
+ case 6: /* VIA C3 */
+ if( !cpu_has_tsc )
+ return -ENODEV;
+ switch( current_cpu_data.x86_model ) {
+ case 6: /* VIA C3 (Cyrix III) */
+ case 7: /* VIA C3 Samuel 2 or Ezra */
+ case 8: /* VIA C3 Ezra-T */
+ break;
+ default:
+ return -ENODEV;
+ }
+ perfctr_cpu_info.type = PERFCTR_X86_VIA_C3;
+ read_counters = rdpmc_read_counters;
+ write_control = p6_write_control;
+ check_control = vc3_check_control;
+ clear_counters = vc3_clear_counters;
+ return 0;
+ }
+ return -ENODEV;
+}
+
+static int __init generic_init(void)
+{
+ if( !cpu_has_tsc )
+ return -ENODEV;
+ perfctr_cpu_info.features &= ~PERFCTR_FEATURE_RDPMC;
+ perfctr_cpu_info.type = PERFCTR_X86_GENERIC;
+ check_control = generic_check_control;
+ write_control = generic_write_control;
+ read_counters = generic_read_counters;
+ clear_counters = generic_clear_counters;
+ return 0;
+}
+
+static char generic_name[] __initdata = "Generic x86 with TSC";
+static char p5_name[] __initdata = "Intel Pentium";
+static char p5mmx_name[] __initdata = "Intel Pentium MMX";
+static char p6_name[] __initdata = "Intel Pentium Pro";
+static char pii_name[] __initdata = "Intel Pentium II";
+static char piii_name[] __initdata = "Intel Pentium III";
+static char mii_name[] __initdata = "Cyrix 6x86MX/MII/III";
+static char wcc6_name[] __initdata = "WinChip C6";
+static char wc2_name[] __initdata = "WinChip 2/3";
+static char k7_name[] __initdata = "AMD K7";
+static char vc3_name[] __initdata = "VIA C3";
+static char p4_name[] __initdata = "Intel Pentium 4";
+static char p4m2_name[] __initdata = "Intel Pentium 4 Model 2";
+
+char *perfctr_cpu_name[] __initdata = {
+ [PERFCTR_X86_GENERIC] generic_name,
+ [PERFCTR_X86_INTEL_P5] p5_name,
+ [PERFCTR_X86_INTEL_P5MMX] p5mmx_name,
+ [PERFCTR_X86_INTEL_P6] p6_name,
+ [PERFCTR_X86_INTEL_PII] pii_name,
+ [PERFCTR_X86_INTEL_PIII] piii_name,
+ [PERFCTR_X86_CYRIX_MII] mii_name,
+ [PERFCTR_X86_WINCHIP_C6] wcc6_name,
+ [PERFCTR_X86_WINCHIP_2] wc2_name,
+ [PERFCTR_X86_AMD_K7] k7_name,
+ [PERFCTR_X86_VIA_C3] vc3_name,
+ [PERFCTR_X86_INTEL_P4] p4_name,
+ [PERFCTR_X86_INTEL_P4M2] p4m2_name,
+};
+
+static void __init perfctr_cpu_init_one(void *ignore)
+{
+ /* PREEMPT note: when called via smp_call_function(),
+ this is in IRQ context with preemption disabled. */
+ perfctr_cpu_clear_counters();
+ perfctr_set_lvtpc(LOCAL_PERFCTR_VECTOR);
+ if( perfctr_cpu_info.features & PERFCTR_FEATURE_RDPMC )
+ set_in_cr4_local(X86_CR4_PCE);
+}
+
+static void __exit perfctr_cpu_exit_one(void *ignore)
+{
+ /* PREEMPT note: when called via smp_call_function(),
+ this is in IRQ context with preemption disabled. */
+ perfctr_cpu_clear_counters();
+ perfctr_set_lvtpc(APIC_DM_NMI | APIC_LVT_MASKED);
+ if( perfctr_cpu_info.features & PERFCTR_FEATURE_RDPMC )
+ clear_in_cr4_local(X86_CR4_PCE);
+}
+
+static void invalidate_per_cpu_cache(void)
+{
+ /*
+	 * Each CPU's per_cpu_cache is initialised to contain "impossible"
+ * evntsel values guaranteed to differ from anything accepted
+ * by perfctr_cpu_update_control(). This way, initialisation of
+ * a CPU's evntsel MSRs will happen automatically the first time
+ * perfctr_cpu_write_control() executes on it.
+ * All-bits-one works for all currently supported processors.
+ * The memset also sets the ids to -1, which is intentional.
+ */
+ unsigned int i;
+ for(i = 0; i < NR_CPUS; ++i)
+ if( cpu_online(i) )
+ memset(&per_cpu(per_cpu_cache, i), ~0,
+ sizeof(struct per_cpu_cache));
+}
+
+/****************************************************************
+ * *
+ * NMI watchdog interface. *
+ * *
+ ****************************************************************/
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PM)
+
+static void __init unregister_nmi_pmdev(void)
+{
+ if( nmi_pmdev ) {
+ apic_pm_unregister(nmi_pmdev);
+ nmi_pmdev = 0;
+ }
+}
+
+static int x86_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
+{
+ /* XXX: incomplete */
+ return 0;
+}
+
+static struct pm_dev *x86_pmdev;
+
+static void __init x86_pm_init(void)
+{
+ x86_pmdev = apic_pm_register(PM_SYS_DEV, 0, x86_pm_callback);
+}
+
+static void __exit x86_pm_exit(void)
+{
+ if( x86_pmdev ) {
+ apic_pm_unregister(x86_pmdev);
+ x86_pmdev = NULL;
+ }
+}
+
+#else /* CONFIG_X86_LOCAL_APIC && CONFIG_PM */
+static inline void unregister_nmi_pmdev(void) { }
+static inline void x86_pm_init(void) { }
+static inline void x86_pm_exit(void) { }
+#endif /* CONFIG_X86_LOCAL_APIC && CONFIG_PM */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void __init disable_nmi_watchdog(void)
+{
+ if( nmi_perfctr_msr ) {
+ nmi_perfctr_msr = 0;
+ printk(KERN_NOTICE "perfctr: disabled nmi_watchdog\n");
+ unregister_nmi_pmdev();
+ }
+}
+#else
+static inline void disable_nmi_watchdog(void) { }
+#endif
+
+/****************************************************************
+ * *
+ * Top-level initialisation. *
+ * *
+ ****************************************************************/
+
+int __init perfctr_cpu_init(void)
+{
+ int err = -ENODEV;
+
+ preempt_disable();
+
+ /* RDPMC and RDTSC are on by default. They will be disabled
+ by the init procedures if necessary. */
+ perfctr_cpu_info.features = PERFCTR_FEATURE_RDPMC | PERFCTR_FEATURE_RDTSC;
+
+ if( cpu_has_msr ) {
+ switch( current_cpu_data.x86_vendor ) {
+ case X86_VENDOR_INTEL:
+ err = intel_init();
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_init();
+ break;
+ case X86_VENDOR_CYRIX:
+ err = cyrix_init();
+ break;
+ case X86_VENDOR_CENTAUR:
+ err = centaur_init();
+ }
+ }
+ if( err ) {
+ err = generic_init(); /* last resort */
+ if( err )
+ goto out;
+ }
+ /*
+ * Put the hardware in a sane state:
+ * - finalise resolution of backpatchable call sites
+ * - clear perfctr MSRs
+ * - set up APIC_LVTPC
+ * - set CR4.PCE [on permanently due to __flush_tlb_global()]
+ * - install our default interrupt handler
+ */
+ if( perfctr_cpu_info.features & PERFCTR_FEATURE_RDPMC )
+ mmu_cr4_features |= X86_CR4_PCE;
+ finalise_backpatching();
+ perfctr_cpu_init_one(NULL);
+ smp_call_function(perfctr_cpu_init_one, NULL, 1, 1);
+ perfctr_cpu_set_ihandler(NULL);
+ /*
+ * Fix up the connection to the local APIC:
+ * - disable and disconnect the NMI watchdog
+ * - register our PM callback
+ */
+ disable_nmi_watchdog();
+ x86_pm_init();
+
+ invalidate_per_cpu_cache();
+
+ perfctr_cpu_info.khz = cpu_khz;
+ perfctr_cpu_info.nrcpus = num_online_cpus();
+
+ out:
+ preempt_enable();
+ return err;
+}
+
+void __exit perfctr_cpu_exit(void)
+{
+ preempt_disable();
+ if( perfctr_cpu_info.features & PERFCTR_FEATURE_RDPMC )
+ mmu_cr4_features &= ~X86_CR4_PCE;
+ perfctr_cpu_exit_one(NULL);
+ smp_call_function(perfctr_cpu_exit_one, NULL, 1, 1);
+ perfctr_cpu_set_ihandler(NULL);
+ x86_pm_exit();
+ /* XXX: restart nmi watchdog? */
+ preempt_enable();
+}
+
+/****************************************************************
+ * *
+ * Hardware reservation for higher-level drivers. *
+ * *
+ ****************************************************************/
+
+/* XXX: hmm, need a spinlock here I think */
+static const char *current_service = 0;
+
+const char *perfctr_cpu_reserve(const char *service)
+{
+ if( current_service )
+ return current_service;
+ current_service = service;
+ return 0;
+}
+
+static void perfctr_cpu_clear_one(void *ignore)
+{
+ /* PREEMPT note: when called via smp_call_function(),
+ this is in IRQ context with preemption disabled. */
+ perfctr_cpu_clear_counters();
+}
+
+void perfctr_cpu_release(const char *service)
+{
+ if( service != current_service ) {
+ printk(KERN_ERR "%s: attempt by %s to release while reserved by %s\n",
+ __FUNCTION__, service, current_service);
+ } else {
+ /* power down the counters */
+ preempt_disable();
+ invalidate_per_cpu_cache();
+ perfctr_cpu_clear_one(NULL);
+ smp_call_function(perfctr_cpu_clear_one, NULL, 1, 1);
+ preempt_enable();
+ perfctr_cpu_set_ihandler(NULL);
+ current_service = 0;
+ }
+}
diff -uN linux-2.5.45/include/asm-i386/perfctr.h linux-2.5.45.perfctr-3.1/include/asm-i386/perfctr.h
--- linux-2.5.45/include/asm-i386/perfctr.h Thu Jan 1 01:00:00 1970
+++ linux-2.5.45.perfctr-3.1/include/asm-i386/perfctr.h Thu Oct 31 22:36:50 2002
@@ -0,0 +1,192 @@
+/* $Id: perfctr.h,v 1.29 2002/10/31 21:36:50 mikpe Exp $
+ * x86 Performance-Monitoring Counters driver
+ *
+ * Copyright (C) 1999-2002 Mikael Pettersson
+ */
+#ifndef _ASM_I386_PERFCTR_H
+#define _ASM_I386_PERFCTR_H
+
+struct perfctr_cpu_info {
+ unsigned char nrcpus;
+ unsigned char type;
+ unsigned char features;
+ unsigned long khz;
+};
+
+/* type values */
+#define PERFCTR_X86_GENERIC 0 /* any x86 with rdtsc */
+#define PERFCTR_X86_INTEL_P5 1 /* no rdpmc */
+#define PERFCTR_X86_INTEL_P5MMX 2
+#define PERFCTR_X86_INTEL_P6 3
+#define PERFCTR_X86_INTEL_PII 4
+#define PERFCTR_X86_INTEL_PIII 5
+#define PERFCTR_X86_CYRIX_MII 6
+#define PERFCTR_X86_WINCHIP_C6 7 /* no rdtsc */
+#define PERFCTR_X86_WINCHIP_2 8 /* no rdtsc */
+#define PERFCTR_X86_AMD_K7 9
+#define PERFCTR_X86_VIA_C3 10 /* no pmc0 */
+#define PERFCTR_X86_INTEL_P4 11 /* model 0 and 1 */
+#define PERFCTR_X86_INTEL_P4M2 12 /* model 2 and above */
+
+/* features flag bits */
+#define PERFCTR_FEATURE_RDPMC 0x01
+#define PERFCTR_FEATURE_RDTSC 0x02
+#define PERFCTR_FEATURE_PCINT 0x04
+
+struct perfctr_sum_ctrs {
+ unsigned long long tsc;
+ unsigned long long pmc[18];
+};
+
+struct perfctr_low_ctrs {
+ unsigned int tsc;
+ unsigned int pmc[18];
+};
+
+struct perfctr_cpu_control {
+ unsigned int tsc_on;
+ unsigned int nractrs; /* # of a-mode counters */
+ unsigned int nrictrs; /* # of i-mode counters */
+ unsigned int pmc_map[18];
+ unsigned int evntsel[18]; /* one per counter, even on P5 */
+ unsigned int evntsel_aux[18]; /* e.g. P4 ESCR contents */
+ struct {
+ unsigned int pebs_enable; /* for replay tagging */
+ unsigned int pebs_matrix_vert; /* for replay tagging */
+ } p4;
+ int ireset[18]; /* <= 0, for i-mode counters */
+};
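+
+/* Hypothetical setup sketch (illustration only; the evntsel encoding
+ * below is an assumed P6-family example, not part of this interface):
+ *
+ *	struct perfctr_cpu_control control;
+ *	memset(&control, 0, sizeof control);
+ *	control.tsc_on = 1;
+ *	control.nractrs = 1;
+ *	control.pmc_map[0] = 0;
+ *	control.evntsel[0] = 0x004100C0; (ENable, USR, event 0xC0)
+ *
+ * then call perfctr_cpu_update_control() on the state embedding it.
+ */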
+
+struct perfctr_cpu_state {
+ unsigned int cstatus;
+ union {
+ unsigned int p5_cesr;
+ unsigned int id; /* cache owner id */
+ } k1;
+ struct perfctr_sum_ctrs sum;
+ struct perfctr_low_ctrs start;
+ struct perfctr_cpu_control control;
+ struct {
+ unsigned int p4_escr_map[18];
+ const void *isuspend_cpu;
+ } k2;
+};
+
+/* `struct perfctr_cpu_state' binary layout version number */
+#define PERFCTR_CPU_STATE_MAGIC 0x0201 /* 2.1 */
+
+/* cstatus is a re-encoding of control.tsc_on/nractrs/nrictrs
+ which should have less overhead in most cases */
+
+static inline
+unsigned int perfctr_mk_cstatus(unsigned int tsc_on, unsigned int nractrs,
+ unsigned int nrictrs)
+{
+ return (tsc_on<<31) | (nrictrs<<16) | ((nractrs+nrictrs)<<8) | nractrs;
+}
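+
+/* Example: tsc_on=1, nractrs=2, nrictrs=1 encodes to
+   (1<<31)|(1<<16)|(3<<8)|2 == 0x80010302, from which the accessors
+   below recover nractrs == 2, nrctrs == 3, a non-zero has_ictrs
+   test, and a negative (TSC enabled) sign bit. */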
+
+static inline unsigned int perfctr_cstatus_enabled(unsigned int cstatus)
+{
+ return cstatus;
+}
+
+static inline int perfctr_cstatus_has_tsc(unsigned int cstatus)
+{
+ return (int)cstatus < 0; /* test and jump on sign */
+}
+
+static inline unsigned int perfctr_cstatus_nractrs(unsigned int cstatus)
+{
+ return cstatus & 0x7F; /* and with imm8 */
+}
+
+static inline unsigned int perfctr_cstatus_nrctrs(unsigned int cstatus)
+{
+ return (cstatus >> 8) & 0x7F;
+}
+
+static inline unsigned int perfctr_cstatus_has_ictrs(unsigned int cstatus)
+{
+ return cstatus & (0x7F << 16);
+}
+
+/*
+ * 'struct siginfo' support for perfctr overflow signals.
+ * In unbuffered mode, si_code is set to SI_PMC_OVF and a bitmask
+ * describing which perfctrs overflowed is put in si_pmc_ovf_mask.
+ * A bitmask is used since more than one perfctr can have overflowed
+ * by the time the interrupt handler runs.
+ *
+ * glibc's <signal.h> doesn't seem to define __SI_FAULT or __SI_CODE(),
+ * and including <asm/siginfo.h> as well may cause redefinition errors,
+ * so the user and kernel values are different #defines here.
+ */
+#ifdef __KERNEL__
+#define SI_PMC_OVF (__SI_FAULT|'P')
+#else
+#define SI_PMC_OVF ('P')
+#endif
+#define si_pmc_ovf_mask _sifields._pad[0] /* XXX: use an unsigned field later */
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_PERFCTR
+
+/* Driver init/exit. */
+extern int perfctr_cpu_init(void);
+extern void perfctr_cpu_exit(void);
+
+/* CPU info. */
+extern struct perfctr_cpu_info perfctr_cpu_info;
+
+/* CPU type name. */
+extern char *perfctr_cpu_name[];
+
+/* Hardware reservation. */
+extern const char *perfctr_cpu_reserve(const char *service);
+extern void perfctr_cpu_release(const char *service);
+
+/* PRE: state has no running interrupt-mode counters.
+ Check that the new control data is valid.
+ Update the driver's private control data.
+ Returns a negative error code if the control data is invalid. */
+extern int perfctr_cpu_update_control(struct perfctr_cpu_state *state);
+
+/* Read a-mode counters. Subtract from start and accumulate into sums.
+ Must be called with preemption disabled. */
+extern void perfctr_cpu_suspend(struct perfctr_cpu_state *state);
+
+/* Write control registers. Read a-mode counters into start.
+ Must be called with preemption disabled. */
+extern void perfctr_cpu_resume(struct perfctr_cpu_state *state);
+
+/* Perform an efficient combined suspend/resume operation.
+ Must be called with preemption disabled. */
+extern void perfctr_cpu_sample(struct perfctr_cpu_state *state);
+
+/* The type of a perfctr overflow interrupt handler.
+ It will be called in IRQ context, with preemption disabled. */
+typedef void (*perfctr_ihandler_t)(unsigned long pc);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#define PERFCTR_INTERRUPT_SUPPORT 1 /* for higher-level drivers */
+extern void perfctr_cpu_set_ihandler(perfctr_ihandler_t);
+extern void perfctr_cpu_ireload(struct perfctr_cpu_state*);
+extern unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state*);
+#else
+static inline void perfctr_cpu_set_ihandler(perfctr_ihandler_t x) { }
+#endif
+
+#endif /* CONFIG_PERFCTR */
+
+#if defined(CONFIG_PERFCTR) && defined(CONFIG_X86_LOCAL_APIC)
+extern void perfctr_interrupt(void);
+#define perfctr_vector_init() \
+ set_intr_gate(LOCAL_PERFCTR_VECTOR, perfctr_interrupt)
+#else
+#define perfctr_vector_init() do{}while(0)
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_I386_PERFCTR_H */