Index: sys/sys/kstat.h
===================================================================
RCS file: /cvs/src/sys/sys/kstat.h,v
retrieving revision 1.4
diff -u -p -r1.4 kstat.h
--- sys/sys/kstat.h	16 Nov 2023 02:45:20 -0000	1.4
+++ sys/sys/kstat.h	22 Nov 2023 03:58:04 -0000
@@ -93,6 +93,8 @@ enum kstat_kv_unit {
 	KSTAT_KV_U_PACKETS,	/* packets */
 	KSTAT_KV_U_BYTES,	/* bytes */
 	KSTAT_KV_U_CYCLES,	/* cycles */
+	KSTAT_KV_U_INSTR,	/* instructions */
+	KSTAT_KV_U_UJOULES,	/* uJoules */
 };

 struct kstat_kv {
Index: sys/arch/amd64/amd64/cpumon.c
===================================================================
RCS file: sys/arch/amd64/amd64/cpumon.c
diff -N sys/arch/amd64/amd64/cpumon.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/arch/amd64/amd64/cpumon.c	22 Nov 2023 03:58:04 -0000
@@ -0,0 +1,826 @@
+/*	$OpenBSD$ */
+
+/*
+ * Copyright (c) 2022, 2023 David Gwynne
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "kstat.h"
+
+#if NKSTAT == 0
+#error cpumon(4) requires kstat(4)
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#define CPUMON_MSR_AMD_RAPL_PWR_UNIT		0xC0010299
+#define CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT	8
+#define CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_MASK	0x1f
+#define CPUMON_MSR_AMD_CORE_ENERGY_STAT		0xC001029A
+#define CPUMON_MSR_AMD_PKG_ENERGY_STAT		0xC001029B
+
+struct cpumon_rapl {
+	uint64_t	rapl_energy;		/* accumulator */
+	uint32_t	rapl_energy_prev;
+	unsigned int	rapl_energy_unit;
+	uint32_t	rapl_energy_msr;
+};
+
+enum cpumon_core_map {
+	CPUMON_CORE_MAP_TSC,
+	CPUMON_CORE_MAP_EFF_PERF,
+	CPUMON_CORE_MAP_SMI,
+	CPUMON_CORE_MAP_IRPERF,
+
+	CPUMON_CORE_MAP_TEMP,
+	CPUMON_CORE_MAP_RAPL,
+
+	CPUMON_CORE_MAP_COUNT
+};
+
+struct cpumon_core {
+	struct cpumon_softc	*c_sc;
+	struct cpu_info		*c_ci;
+	struct kstat		*c_ks;
+
+	TAILQ_ENTRY(cpumon_core) c_entry;
+
+	unsigned int		c_nkvs;
+	int8_t			c_map[CPUMON_CORE_MAP_COUNT];
+
+	int			c_temp_max;
+
+	struct cpumon_rapl	c_rapl;
+	struct task		c_rapl_xcall;
+};
+
+TAILQ_HEAD(cpumon_cores, cpumon_core);
+
+enum cpumon_pkg_map {
+	CPUMON_PKG_MAP_RAPL_PP0,
+	CPUMON_PKG_MAP_RAPL_PKG,
+	CPUMON_PKG_MAP_RAPL_RAM,
+	CPUMON_PKG_MAP_RAPL_PP1,
+	CPUMON_PKG_MAP_RAPL_PSYS,
+
+	CPUMON_PKG_MAP_COUNT
+};
+
+static const char *cpump_pkg_names[] = {
+	[CPUMON_PKG_MAP_RAPL_PP0] =	"pp0",
+	[CPUMON_PKG_MAP_RAPL_PKG] =	"pkg",
+	[CPUMON_PKG_MAP_RAPL_RAM] =	"ram",
+	[CPUMON_PKG_MAP_RAPL_PP1] =	"pp1",
+	[CPUMON_PKG_MAP_RAPL_PSYS] =	"psys",
+};
+
+struct cpumon_pkg {
+	struct cpumon_softc	*p_sc;
+	struct cpu_info		*p_ci;
+	struct kstat		*p_ks;
+
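+	/*
+	 * as with c_map in cpumon_core, p_map[] below records which
+	 * kstat_kv slot each stat was given; 0 means the stat is not
+	 * present (slot 0 always holds the package id, so it can
+	 * never be a probed stat).
+	 */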
TAILQ_ENTRY(cpumon_pkg) p_entry; + + unsigned int p_nkvs; + int8_t p_map[CPUMON_PKG_MAP_COUNT]; + + struct cpumon_rapl p_rapl[CPUMON_PKG_MAP_COUNT]; + struct task p_rapl_xcall; +}; + +TAILQ_HEAD(cpumon_pkgs, cpumon_pkg); + +struct cpumon_softc { + struct device sc_dev; + struct task sc_deferred; + + struct cpumon_cores sc_cores; + struct cpumon_pkgs sc_pkgs; + + /* used by the ticks below to wait for all cores/pkgs to read stuff */ + struct refcnt sc_rapl_refs; + + struct timeout sc_core_rapl_tick; + struct timeout sc_pkg_rapl_tick; +}; + +static int cpumon_match(struct device *, void *, void *); +static void cpumon_attach(struct device *, struct device *, void *); + +struct cfdriver cpumon_cd = { + NULL, "cpumon", DV_DULL, CD_SKIPHIBERNATE +}; + +const struct cfattach cpumon_ca = { + sizeof(struct cpumon_softc), cpumon_match, cpumon_attach, NULL, NULL +}; + +static void cpumon_deferred(void *); +static struct cpumon_core * + cpumon_attach_core(struct cpumon_softc *, struct cpu_info *); +static struct cpumon_pkg * + cpumon_attach_pkg(struct cpumon_softc *, struct cpu_info *); + +static void cpumon_core_rapl_tick(void *); +static void cpumon_pkg_rapl_tick(void *); + +static int +cpumon_match(struct device *parent, void *match, void *aux) +{ + const char **busname = (const char **)aux; + + if (strcmp(*busname, cpumon_cd.cd_name) != 0) + return (0); + + return (1); +} + +static void +cpumon_attach(struct device *parent, struct device *self, void *aux) +{ + struct cpumon_softc *sc = (struct cpumon_softc *)self; + + printf("\n"); + + task_set(&sc->sc_deferred, cpumon_deferred, sc); + TAILQ_INIT(&sc->sc_cores); + TAILQ_INIT(&sc->sc_pkgs); + timeout_set_proc(&sc->sc_core_rapl_tick, cpumon_core_rapl_tick, sc); + timeout_set_proc(&sc->sc_pkg_rapl_tick, cpumon_pkg_rapl_tick, sc); + + task_add(systqmp, &sc->sc_deferred); +} + +static inline uint32_t +cpumon_rapl_read_msr(const struct cpumon_rapl *rapl) +{ + return (rdmsr(rapl->rapl_energy_msr)); +} + +static void +cpumon_core_rapl_xcall(void *arg) +{ + struct cpumon_core *c = arg; + struct cpumon_softc *sc = c->c_sc; + struct cpumon_rapl *rapl = &c->c_rapl; + uint32_t energy_now; + uint32_t diff; + + energy_now = cpumon_rapl_read_msr(rapl); + diff = energy_now - rapl->rapl_energy_prev; + + rapl->rapl_energy_prev = energy_now; + rapl->rapl_energy += diff; + + refcnt_rele_wake(&sc->sc_rapl_refs); +} + +static void +cpumon_pkg_rapl_xcall(void *arg) +{ + struct cpumon_pkg *p = arg; + struct cpumon_softc *sc = p->p_sc; + struct cpumon_rapl *rapl; + uint32_t energy_now; + uint32_t diff; + size_t i; + + for (i = 0; i < nitems(p->p_rapl); i++) { + if (p->p_map[i] == 0) + continue; + + rapl = &p->p_rapl[i]; + + energy_now = cpumon_rapl_read_msr(rapl); + diff = energy_now - rapl->rapl_energy_prev; + + rapl->rapl_energy_prev = energy_now; + rapl->rapl_energy += diff; + } + + refcnt_rele_wake(&sc->sc_rapl_refs); +} + +static uint64_t +cpumon_rapl_read(struct cpumon_rapl *rapl, uint32_t energy_now) +{ + uint32_t diff = energy_now - rapl->rapl_energy_prev; + uint64_t energy = rapl->rapl_energy + diff; + + rapl->rapl_energy_prev = energy_now; + rapl->rapl_energy = energy; + + /* XXX i feel like this will overflow */ + return ((energy * 1000000) >> rapl->rapl_energy_unit); +} + +static void +cpumon_probe_core_effperf(struct cpumon_core *c) +{ + uint32_t eax, ebx, ecx, edx; + + CPUID(0x06, eax, ebx, ecx, edx); + + if (ecx & (1 << 0)) { + c->c_map[CPUMON_CORE_MAP_EFF_PERF] = c->c_nkvs; + c->c_nkvs += 2; + } +} + +static void +cpumon_probe_core_intel(struct cpumon_core 
*c) +{ + struct cpu_info *ci = c->c_ci; + + if (cpuid_level >= 0x06) { + cpumon_probe_core_effperf(c); + + switch (ci->ci_model) { + case 0x45: /* Haswell mobile */ + c->c_map[CPUMON_CORE_MAP_SMI] = c->c_nkvs; + c->c_nkvs += 1; + } + } + + if (ISSET(ci->ci_feature_tpmflags, TPM_SENSOR)) { + c->c_map[CPUMON_CORE_MAP_TEMP] = c->c_nkvs; + c->c_nkvs += 1; + + c->c_temp_max = 100; + + /* Only some Core family chips have MSR_TEMPERATURE_TARGET. */ + if (ci->ci_model == 0x0e && + (rdmsr(MSR_TEMPERATURE_TARGET_UNDOCUMENTED) & + MSR_TEMPERATURE_TARGET_LOW_BIT_UNDOCUMENTED)) + c->c_temp_max = 85; + + /* + * Newer CPUs can tell you what their max temperature is. + * See: '64-ia-32-architectures-software-developer- + * vol-3c-part-3-manual.pdf' + */ + if (ci->ci_model > 0x17 && ci->ci_model != 0x1c && + ci->ci_model != 0x26 && ci->ci_model != 0x27 && + ci->ci_model != 0x35 && ci->ci_model != 0x36) + c->c_temp_max = MSR_TEMPERATURE_TARGET_TJMAX( + rdmsr(MSR_TEMPERATURE_TARGET)); + } +} + +static void +cpumon_probe_core_amd(struct cpumon_core *c) +{ + cpumon_probe_core_effperf(c); + + if (c->c_ci->ci_family >= 0x17) { + uint32_t eax, ebx, ecx, edx; + + CPUID(0x80000008, eax, ebx, ecx, edx); + if (ebx & (1 << 1)) { + c->c_map[CPUMON_CORE_MAP_IRPERF] = c->c_nkvs; + c->c_nkvs += 1; + } + + CPUID(0x80000007, eax, ebx, ecx, edx); + if (edx & (1 << 14)) { + c->c_map[CPUMON_CORE_MAP_RAPL] = c->c_nkvs; + c->c_nkvs += 1; + } + } +} + +static void +cpumon_deferred(void *arg) +{ + struct cpumon_softc *sc = arg; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + struct cpumon_core *c; + int rapl = 0; + + CPU_INFO_FOREACH(cii, ci) { + sched_peg_curproc(ci); + + c = cpumon_attach_core(sc, ci); + if (c && c->c_map[CPUMON_CORE_MAP_RAPL]) + rapl = 1; + + cpumon_attach_pkg(sc, ci); + } + + atomic_clearbits_int(&curproc->p_flag, P_CPUPEG); + + if (rapl) + timeout_add_sec(&sc->sc_core_rapl_tick, 53); + + if (!TAILQ_EMPTY(&sc->sc_pkgs)) + timeout_add_sec(&sc->sc_pkg_rapl_tick, 5); +} + +static void +cpumon_core_rapl_tick(void *arg) +{ + struct cpumon_softc *sc = arg; + struct cpumon_core *c; + + refcnt_init(&sc->sc_rapl_refs); + + TAILQ_FOREACH(c, &sc->sc_cores, c_entry) { + if (c->c_map[CPUMON_CORE_MAP_RAPL] == 0) { + /* is this even possible? 
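+			 * yes, if rapl probed on some cores but not
+			 * others: every core is kept on sc_cores, so
+			 * skip the ones without rapl.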
+			 */
+			continue;
+		}
+
+		refcnt_take(&sc->sc_rapl_refs);
+		cpu_xcall(c->c_ci, &c->c_rapl_xcall);
+	}
+
+	refcnt_finalize(&sc->sc_rapl_refs, "raplcore");
+
+	/* this doesn't have to be accurate */
+	timeout_add_sec(&sc->sc_core_rapl_tick, 53);
+}
+
+static void
+cpumon_pkg_rapl_tick(void *arg)
+{
+	struct cpumon_softc *sc = arg;
+	struct cpumon_pkg *p;
+
+	refcnt_init(&sc->sc_rapl_refs);
+
+	TAILQ_FOREACH(p, &sc->sc_pkgs, p_entry) {
+		refcnt_take(&sc->sc_rapl_refs);
+		cpu_xcall(p->p_ci, &p->p_rapl_xcall);
+	}
+
+	refcnt_finalize(&sc->sc_rapl_refs, "raplpkg");
+
+	/* this doesn't have to be accurate */
+	timeout_add_sec(&sc->sc_pkg_rapl_tick, 7);
+}
+
+struct cpumon_xcall {
+	struct kstat *cx_ks;
+	struct cond cx_c;
+};
+
+static void
+cpumon_read_core_xcall(void *arg)
+{
+	struct cpumon_xcall *cx = arg;
+	struct kstat *ks = cx->cx_ks;
+	struct kstat_kv *kvs = ks->ks_data;
+	struct cpumon_core *c = ks->ks_softc;
+	unsigned long s;
+	uint32_t energy_now;
+	int idx, rapl;
+
+	/* this isn't timing critical */
+	idx = c->c_map[CPUMON_CORE_MAP_TEMP];
+	if (idx) {
+		uint64_t msr = rdmsr(MSR_THERM_STATUS);
+
+		if (msr & MSR_THERM_STATUS_VALID_BIT) {
+			uint64_t v;
+
+			v = c->c_temp_max - MSR_THERM_STATUS_TEMP(msr);
+			/* micro degrees */
+			v *= 1000000;
+			/* kelvin */
+			v += 273150000;
+
+			kvs[idx].kv_type = KSTAT_KV_T_TEMP;
+			kstat_kv_temp(&kvs[idx]) = v;
+		} else
+			kvs[idx].kv_type = KSTAT_KV_T_NULL;
+	}
+
+	s = intr_disable();
+	idx = c->c_map[CPUMON_CORE_MAP_TSC];
+	if (idx)
+		kstat_kv_u64(&kvs[idx]) = rdtsc_lfence();
+
+	idx = c->c_map[CPUMON_CORE_MAP_EFF_PERF];
+	if (idx) {
+		kstat_kv_u64(&kvs[idx + 0]) = rdmsr(0xe7);	/* MPERF */
+		kstat_kv_u64(&kvs[idx + 1]) = rdmsr(0xe8);	/* APERF */
+	}
+
+	idx = c->c_map[CPUMON_CORE_MAP_SMI];
+	if (idx)
+		kstat_kv_u32(&kvs[idx]) = rdmsr(0x34);	/* MSR_SMI_COUNT */
+
+	idx = c->c_map[CPUMON_CORE_MAP_IRPERF];
+	if (idx)
+		kstat_kv_u64(&kvs[idx]) = rdmsr(0xc00000e9); /* IRPerfCount */
+
+	rapl = c->c_map[CPUMON_CORE_MAP_RAPL];
+	if (rapl)
+		energy_now = cpumon_rapl_read_msr(&c->c_rapl);
+
+	nanouptime(&ks->ks_updated);
+	intr_restore(s);
+
+	if (rapl) {
+		kstat_kv_u64(&kvs[rapl]) =
+		    cpumon_rapl_read(&c->c_rapl, energy_now);
+	}
+
+	cond_signal(&cx->cx_c);
+}
+
+static int
+cpumon_read_core(struct kstat *ks)
+{
+	struct timespec now, diff;
+
+	/* rate limit the updates to roughly twice a second */
+	getnanouptime(&now);
+	timespecsub(&now, &ks->ks_updated, &diff);
+	if (diff.tv_sec > 0 || diff.tv_nsec > 500000000) {
+		struct cpumon_xcall cx = { ks, COND_INITIALIZER() };
+		struct task t = TASK_INITIALIZER(cpumon_read_core_xcall, &cx);
+		struct cpumon_core *c = ks->ks_softc;
+
+		cpu_xcall(c->c_ci, &t);
+
+		cond_wait(&cx.cx_c, "cpumonc");
+	}
+
+	return (0);
+}
+
+static struct cpumon_core *
+cpumon_attach_core(struct cpumon_softc *sc, struct cpu_info *ci)
+{
+	struct kstat *ks;
+	struct kstat_kv *kvs;
+	struct cpumon_core *c;
+	int idx;
+
+	TAILQ_FOREACH(c, &sc->sc_cores, c_entry) {
+		if (ci->ci_pkg_id == c->c_ci->ci_pkg_id &&
+		    ci->ci_core_id == c->c_ci->ci_core_id) {
+			/* core is already being monitored */
+
+			if (ci->ci_smt_id < c->c_ci->ci_smt_id) {
+				/* prefer low threads */
+				c->c_ci = ci;
+			}
+
+			return (NULL);
+		}
+	}
+
+	ks = kstat_create("cpu-core", ci->ci_pkg_id << 24 | ci->ci_core_id,
+	    "cpumon", 0, KSTAT_T_KV, 0);
+	if (ks == NULL) {
+		printf("unable to create cpu-core kstat for pkg %u core %d\n",
+		    ci->ci_pkg_id, ci->ci_core_id);
+		return (NULL);
+	}
+
+	c = malloc(sizeof(*c), M_DEVBUF, M_WAITOK|M_ZERO);
+	c->c_sc = sc;
+	c->c_ci = ci;
+	c->c_ks = ks;
+	c->c_nkvs = 2; /* pkg and core ids */
+
+	/* assume we have tsc */
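+	/*
+	 * the probes below claim further kv slots by recording the
+	 * current c_nkvs in c_map[] and advancing it; 0 in c_map[]
+	 * means the stat is not present.
+	 */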
+	c->c_map[CPUMON_CORE_MAP_TSC] = c->c_nkvs;
+	c->c_nkvs += 1;
+
+	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+		cpumon_probe_core_intel(c);
+	else if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+		cpumon_probe_core_amd(c);
+
+	kvs = mallocarray(c->c_nkvs, sizeof(*kvs), M_DEVBUF, M_WAITOK|M_ZERO);
+
+	kstat_kv_init(&kvs[0], "package", KSTAT_KV_T_UINT32);
+	kstat_kv_u32(&kvs[0]) = ci->ci_pkg_id;
+	kstat_kv_init(&kvs[1], "core", KSTAT_KV_T_UINT32);
+	kstat_kv_u32(&kvs[1]) = ci->ci_core_id;
+
+	idx = c->c_map[CPUMON_CORE_MAP_TSC];
+	if (idx) {
+		kstat_kv_unit_init(&kvs[idx], "tsc",
+		    KSTAT_KV_T_COUNTER64, KSTAT_KV_U_CYCLES);
+	}
+
+	idx = c->c_map[CPUMON_CORE_MAP_EFF_PERF];
+	if (idx) {
+		kstat_kv_init(&kvs[idx + 0], "mperf", KSTAT_KV_T_COUNTER64);
+		kstat_kv_init(&kvs[idx + 1], "aperf", KSTAT_KV_T_COUNTER64);
+	}
+
+	idx = c->c_map[CPUMON_CORE_MAP_SMI];
+	if (idx)
+		kstat_kv_init(&kvs[idx], "smi", KSTAT_KV_T_COUNTER32);
+
+	idx = c->c_map[CPUMON_CORE_MAP_IRPERF];
+	if (idx) {
+		uint64_t msr;
+
+		/* enable IRPerfEn in HWCR */
+		msr = rdmsr(0xC0010015);
+		SET(msr, (1 << 30));
+		wrmsr(0xC0010015, msr);
+
+		kstat_kv_unit_init(&kvs[idx], "irperf",
+		    KSTAT_KV_T_COUNTER64, KSTAT_KV_U_INSTR);
+	}
+
+	idx = c->c_map[CPUMON_CORE_MAP_TEMP];
+	if (idx)
+		kstat_kv_init(&kvs[idx], "temperature", KSTAT_KV_T_TEMP);
+
+	idx = c->c_map[CPUMON_CORE_MAP_RAPL];
+	if (idx) {
+		uint64_t rapl_pwr_unit;
+		unsigned int unit;
+
+		rapl_pwr_unit = rdmsr(CPUMON_MSR_AMD_RAPL_PWR_UNIT);
+		unit = rapl_pwr_unit >> CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT;
+		unit &= CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_MASK;
+
+		task_set(&c->c_rapl_xcall, cpumon_core_rapl_xcall, c);
+
+		c->c_rapl.rapl_energy_msr = CPUMON_MSR_AMD_CORE_ENERGY_STAT;
+		c->c_rapl.rapl_energy_prev = rdmsr(c->c_rapl.rapl_energy_msr);
+		c->c_rapl.rapl_energy_unit = unit;
+
+		kstat_kv_unit_init(&kvs[idx], "energy",
+		    KSTAT_KV_T_COUNTER64, KSTAT_KV_U_UJOULES);
+	}
+
+	ks->ks_data = kvs;
+	ks->ks_datalen = c->c_nkvs * sizeof(*kvs);
+	ks->ks_read = cpumon_read_core;
+	ks->ks_softc = c;
+
+	kstat_install(ks);
+
+	TAILQ_INSERT_TAIL(&sc->sc_cores, c, c_entry);
+
+	return (c);
+}
+
+static void
+cpumon_read_pkg_xcall(void *arg)
+{
+	struct cpumon_xcall *cx = arg;
+	struct kstat *ks = cx->cx_ks;
+	struct kstat_kv *kvs = ks->ks_data;
+	struct cpumon_pkg *p = ks->ks_softc;
+	unsigned long s;
+	uint32_t energy_now[nitems(p->p_map)];
+	size_t i;
+	int idx;
+
+	s = intr_disable();
+	for (i = 0; i < nitems(p->p_map); i++) {
+		if (p->p_map[i] == 0)
+			continue;
+
+		energy_now[i] = cpumon_rapl_read_msr(&p->p_rapl[i]);
+	}
+
+	nanouptime(&ks->ks_updated);
+	intr_restore(s);
+
+	for (i = 0; i < nitems(p->p_map); i++) {
+		idx = p->p_map[i];
+		if (idx == 0)
+			continue;
+
+		kstat_kv_u64(&kvs[idx]) =
+		    cpumon_rapl_read(&p->p_rapl[i], energy_now[i]);
+	}
+
+	cond_signal(&cx->cx_c);
+}
+
+static int
+cpumon_read_pkg(struct kstat *ks)
+{
+	struct timespec now, diff;
+
+	/* rate limit the updates to roughly twice a second */
+	getnanouptime(&now);
+	timespecsub(&now, &ks->ks_updated, &diff);
+	if (diff.tv_sec > 0 || diff.tv_nsec > 500000000) {
+		struct cpumon_xcall cx = { ks, COND_INITIALIZER() };
+		struct task t = TASK_INITIALIZER(cpumon_read_pkg_xcall, &cx);
+		struct cpumon_pkg *p = ks->ks_softc;
+
+		cpu_xcall(p->p_ci, &t);
+
+		cond_wait(&cx.cx_c, "cpumonp");
+	}
+
+	return (0);
+}
+
+static uint32_t cpumon_intel_rapl_msrs[] = {
+	[CPUMON_PKG_MAP_RAPL_PP0] = 0x00000639,
+	[CPUMON_PKG_MAP_RAPL_PKG] = 0x00000611,
+	[CPUMON_PKG_MAP_RAPL_RAM] = 0x00000619,
+	[CPUMON_PKG_MAP_RAPL_PP1] = 0x00000641,
+	[CPUMON_PKG_MAP_RAPL_PSYS] = 0x0000064D,
+};
+
+static int
+cpumon_probe_pkg_intel(struct cpumon_pkg *p)
+{
+	struct cpu_info *ci = p->p_ci;
+	uint64_t rapl_pwr_unit;
+	unsigned int unit;
+	struct cpumon_rapl *rapl;
+	size_t i;
+	int rv = 0;
+
+	if (ci->ci_family < 0x06)
+		return (0);
+
+	switch (ci->ci_model) {
+	case 0x45: /* Haswell mobile */
+		/*
+		 * MSR_RAPL_POWER_UNIT; the ESU field is in the same
+		 * place as in the AMD register, so reuse those macros.
+		 */
+		rapl_pwr_unit = rdmsr(0x00000606);
+		unit = rapl_pwr_unit >>
+		    CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT;
+		unit &= CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_MASK;
+
+		/* this relies on the order of the pkg map enum */
+		for (i = CPUMON_PKG_MAP_RAPL_PP0;
+		     i <= CPUMON_PKG_MAP_RAPL_PP1; i++) {
+			p->p_map[i] = p->p_nkvs;
+			p->p_nkvs += 1;
+
+			rapl = &p->p_rapl[i];
+			rapl->rapl_energy_msr = cpumon_intel_rapl_msrs[i];
+			rapl->rapl_energy_prev = rdmsr(rapl->rapl_energy_msr);
+			rapl->rapl_energy_unit = unit;
+		}
+
+		rv = 1;
+	}
+
+	return (rv);
+}
+
+static int
+cpumon_probe_pkg_amd(struct cpumon_pkg *p)
+{
+	uint64_t rapl_pwr_unit;
+	unsigned int unit;
+	struct cpumon_rapl *rapl;
+	int rv = 0;
+
+	if (p->p_ci->ci_family >= 0x17) {
+		uint32_t eax, ebx, ecx, edx;
+
+		CPUID(0x80000007, eax, ebx, ecx, edx);
+		if (edx & (1 << 14)) {
+			p->p_map[CPUMON_PKG_MAP_RAPL_PKG] = p->p_nkvs;
+			p->p_nkvs += 1;
+
+			rapl_pwr_unit = rdmsr(CPUMON_MSR_AMD_RAPL_PWR_UNIT);
+			unit = rapl_pwr_unit >>
+			    CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT;
+			unit &= CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_MASK;
+
+			rapl = &p->p_rapl[CPUMON_PKG_MAP_RAPL_PKG];
+
+			rapl->rapl_energy_msr = CPUMON_MSR_AMD_PKG_ENERGY_STAT;
+			rapl->rapl_energy_prev = rdmsr(rapl->rapl_energy_msr);
+			rapl->rapl_energy_unit = unit;
+
+			rv = 1;
+		}
+	}
+
+	return (rv);
+}
+
+static struct cpumon_pkg *
+cpumon_attach_pkg(struct cpumon_softc *sc, struct cpu_info *ci)
+{
+	struct kstat *ks;
+	struct kstat_kv *kvs;
+	struct cpumon_pkg *p;
+	int rv = 0;
+	size_t i;
+	int idx;
+
+	TAILQ_FOREACH(p, &sc->sc_pkgs, p_entry) {
+		if (ci->ci_pkg_id == p->p_ci->ci_pkg_id) {
+			/* pkg is already being monitored */
+
+			return (NULL);
+		}
+	}
+
+	p = malloc(sizeof(*p), M_DEVBUF, M_WAITOK|M_ZERO);
+	p->p_sc = sc;
+	p->p_ci = ci;
+	p->p_nkvs = 1; /* pkg id */
+	task_set(&p->p_rapl_xcall, cpumon_pkg_rapl_xcall, p);
+
+	if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+		rv = cpumon_probe_pkg_intel(p);
+	else if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+		rv = cpumon_probe_pkg_amd(p);
+
+	if (rv == 0) {
+		free(p, M_DEVBUF, sizeof(*p));
+		return (NULL);
+	}
+
+	ks = kstat_create("cpu-pkg", ci->ci_pkg_id, "cpumon", 0, KSTAT_T_KV, 0);
+	if (ks == NULL) {
+		printf("unable to create cpu-pkg kstat for pkg %u\n",
+		    ci->ci_pkg_id);
+		free(p, M_DEVBUF, sizeof(*p));
+		return (NULL);
+	}
+	p->p_ks = ks;
+
+	kvs = mallocarray(p->p_nkvs, sizeof(*kvs), M_DEVBUF, M_WAITOK|M_ZERO);
+
+	kstat_kv_init(&kvs[0], "package",
KSTAT_KV_T_UINT32); + kstat_kv_u32(&kvs[0]) = ci->ci_pkg_id; + + for (i = 0; i < nitems(p->p_map); i++) { + idx = p->p_map[i]; + if (idx == 0) + continue; + + kstat_kv_unit_init(&kvs[idx], cpump_pkg_names[i], + KSTAT_KV_T_COUNTER64, KSTAT_KV_U_UJOULES); + } + + ks->ks_data = kvs; + ks->ks_datalen = p->p_nkvs * sizeof(*kvs); + ks->ks_read = cpumon_read_pkg; + ks->ks_softc = p; + + kstat_install(ks); + + TAILQ_INSERT_TAIL(&sc->sc_pkgs, p, p_entry); + + return (p); +} Index: sys/arch/amd64/amd64/intr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/intr.c,v retrieving revision 1.55 diff -u -p -r1.55 intr.c --- sys/arch/amd64/amd64/intr.c 28 Dec 2020 14:23:30 -0000 1.55 +++ sys/arch/amd64/amd64/intr.c 22 Nov 2023 03:58:04 -0000 @@ -552,7 +552,10 @@ struct intrhand fake_softclock_intrhand; struct intrhand fake_softnet_intrhand; struct intrhand fake_softtty_intrhand; struct intrhand fake_timer_intrhand; +#ifdef MULTIPROCESSOR struct intrhand fake_ipi_intrhand; +struct intrhand fake_xcall_intrhand; +#endif #if NXEN > 0 struct intrhand fake_xen_intrhand; #endif @@ -619,6 +622,17 @@ cpu_intr_init(struct cpu_info *ci) isp->is_handlers = &fake_ipi_intrhand; isp->is_pic = &local_pic; ci->ci_isources[LIR_IPI] = isp; + + isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_NOWAIT|M_ZERO); + if (isp == NULL) + panic("can't allocate fixed interrupt source"); + isp->is_recurse = Xxcallintr; + isp->is_resume = Xxcallintr; + fake_xcall_intrhand.ih_level = IPL_SOFTCLOCK; + fake_xcall_intrhand.ih_flags = IPL_MPSAFE; + isp->is_handlers = &fake_xcall_intrhand; + isp->is_pic = &local_pic; + ci->ci_isources[SIR_XCALL] = isp; #endif #if NXEN > 0 isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_NOWAIT|M_ZERO); Index: sys/arch/amd64/amd64/ipifuncs.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/ipifuncs.c,v retrieving revision 1.37 diff -u -p -r1.37 ipifuncs.c --- sys/arch/amd64/amd64/ipifuncs.c 7 Aug 2022 23:56:06 -0000 1.37 +++ sys/arch/amd64/amd64/ipifuncs.c 22 Nov 2023 03:58:04 -0000 @@ -61,6 +61,7 @@ void x86_64_ipi_nop(struct cpu_info *); void x86_64_ipi_halt(struct cpu_info *); void x86_64_ipi_wbinvd(struct cpu_info *); +void x86_64_ipi_xcall(struct cpu_info *); #if NVMM > 0 void x86_64_ipi_vmclear_vmm(struct cpu_info *); @@ -108,6 +109,7 @@ void (*ipifunc[X86_NIPI])(struct cpu_inf NULL, #endif x86_64_ipi_wbinvd, + x86_64_ipi_xcall, }; void @@ -166,3 +168,13 @@ x86_64_ipi_wbinvd(struct cpu_info *ci) { wbinvd(); } + +void +x86_64_ipi_xcall(struct cpu_info *ci) +{ + /* + * this is an inlining of softintr() because we already have + * curcpu() and the SIR_XCALL bit to set. 
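+	 * the pending bit is serviced by Xxcallintr, which runs
+	 * cpu_xcall_dispatch() once this cpu's ipl drops below
+	 * IPL_SOFTCLOCK.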
+ */ + x86_atomic_setbits_u64(&ci->ci_ipending, 1UL << SIR_XCALL); +}; Index: sys/arch/amd64/amd64/mainbus.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/mainbus.c,v retrieving revision 1.52 diff -u -p -r1.52 mainbus.c --- sys/arch/amd64/amd64/mainbus.c 21 Feb 2022 11:03:39 -0000 1.52 +++ sys/arch/amd64/amd64/mainbus.c 22 Nov 2023 03:58:04 -0000 @@ -49,6 +49,7 @@ #include "bios.h" #include "mpbios.h" #include "vmm.h" +#include "cpumon.h" #include "pvbus.h" #include "efifb.h" @@ -253,6 +254,13 @@ mainbus_attach(struct device *parent, st if (isa_has_been_seen == 0) config_found(self, &mba_iba, mainbus_print); #endif + +#if NCPUMON > 0 + if (ISSET(cpu_info_primary.ci_feature_flags, CPUID_TSC)) { + mba.mba_busname = "cpumon"; + config_found(self, &mba.mba_busname, mainbus_print); + } +#endif /* NCPUMON > 0 */ #if NVMM > 0 if (vmm_enabled()) { Index: sys/arch/amd64/amd64/softintr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/softintr.c,v retrieving revision 1.10 diff -u -p -r1.10 softintr.c --- sys/arch/amd64/amd64/softintr.c 11 Sep 2020 09:27:09 -0000 1.10 +++ sys/arch/amd64/amd64/softintr.c 22 Nov 2023 03:58:04 -0000 @@ -38,6 +38,9 @@ #include #include +#include +#include + #include #include @@ -169,3 +172,58 @@ softintr_disestablish(void *arg) free(sih, M_DEVBUF, sizeof(*sih)); } + +void +#ifdef MULTIPROCESSOR +cpu_xcall_self(struct task *t) +#else +cpu_xcall(struct cpu_info *ci, struct task *t) +#endif +{ + int s = splsoftclock(); + (*t->t_func)(t->t_arg); + splx(s); +} + +#ifdef MULTIPROCESSOR +void +cpu_xcall(struct cpu_info *ci, struct task *t) +{ + size_t i; + + if (ci == curcpu()) { + /* execute the task immediately on the local cpu */ + cpu_xcall_self(t); + return; + } + + for (;;) { + for (i = 0; i < nitems(ci->ci_xcalls); i++) { + if (atomic_cas_ptr(&ci->ci_xcalls[i], + NULL, t) == NULL) { + /* membar_producer(); */ + x86_send_ipi(ci, X86_IPI_XCALL); + return; + } + } + + CPU_BUSY_CYCLE(); + } +} + +void +cpu_xcall_dispatch(void) +{ + struct cpu_info *ci = curcpu(); + struct task *t; + size_t i; + + for (i = 0; i < nitems(ci->ci_xcalls); i++) { + t = ci->ci_xcalls[i]; + if (t != NULL) { + ci->ci_xcalls[i] = NULL; + (*t->t_func)(t->t_arg); + } + } +} +#endif /* MULTIPROCESSOR */ Index: sys/arch/amd64/amd64/vector.S =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/vector.S,v retrieving revision 1.94 diff -u -p -r1.94 vector.S --- sys/arch/amd64/amd64/vector.S 31 Jul 2023 04:01:07 -0000 1.94 +++ sys/arch/amd64/amd64/vector.S 22 Nov 2023 03:58:04 -0000 @@ -1312,3 +1312,17 @@ KIDTVEC(softclock) jmp retpoline_r13 CODEPATCH_END(CPTAG_RETPOLINE_R13) END(Xsoftclock) + +#ifdef MULTIPROCESSOR +KIDTVEC(xcallintr) + endbr64 + movl $IPL_SOFTCLOCK, CPUVAR(ILEVEL) + sti + incl CPUVAR(IDEPTH) + call _C_LABEL(cpu_xcall_dispatch) + decl CPUVAR(IDEPTH) + CODEPATCH_START + jmp retpoline_r13 + CODEPATCH_END(CPTAG_RETPOLINE_R13) +END(Xcallintr) +#endif /* MULTIPROCESSOR */ Index: sys/arch/amd64/amd64/identcpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v retrieving revision 1.136 diff -u -p -r1.136 identcpu.c --- sys/arch/amd64/amd64/identcpu.c 9 Aug 2023 00:01:44 -0000 1.136 +++ sys/arch/amd64/amd64/identcpu.c 22 Nov 2023 03:58:04 -0000 @@ -36,11 +36,15 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "kstat.h" + #include #include #include #include #include +#include +#include #include "vmm.h" #include "pvbus.h" @@ -61,6 +65,10 @@ void tsc_timecounter_init(struct cpu_inf void cpu_check_vmm_cap(struct cpu_info *); #endif /* NVMM > 0 */ +#if NKSTAT > 0 +static void cpu_kstat_attach(struct cpu_info *); +#endif + /* sysctl wants this. */ char cpu_model[48]; int cpuspeed; @@ -832,6 +840,10 @@ identifycpu(struct cpu_info *ci) sensor_attach(&ci->ci_sensordev, &ci->ci_hz_sensor); #endif } + +#if NKSTAT > 0 + cpu_kstat_attach(ci); +#endif } #ifndef SMALL_KERNEL @@ -1109,3 +1121,68 @@ cpu_check_vmm_cap(struct cpu_info *ci) } } #endif /* NVMM > 0 */ + +#if NKSTAT > 0 +struct cpu_kstat { + struct kstat_kv ck_package; + struct kstat_kv ck_core; + struct kstat_kv ck_thread; + + struct kstat_kv ck_vendor; + struct kstat_kv ck_brand; + char ck_brand_str[48]; + struct kstat_kv ck_family; + struct kstat_kv ck_model; + struct kstat_kv ck_stepping; +}; + +static void +cpu_kstat_attach(struct cpu_info *ci) +{ + struct kstat *ks; + struct cpu_kstat *ck; + + ks = kstat_create(ci->ci_dev->dv_xname, 0, "mach", 0, KSTAT_T_KV, 0); + if (ks == NULL) { + printf("%s: unable to attach kstat\n", ci->ci_dev->dv_xname); + return; + } + + ck = malloc(sizeof(*ck), M_DEVBUF, M_WAITOK|M_ZERO); + + kstat_kv_init(&ck->ck_package, "package", KSTAT_KV_T_UINT32); + kstat_kv_u32(&ck->ck_package) = ci->ci_pkg_id; + kstat_kv_init(&ck->ck_core, "core", KSTAT_KV_T_UINT32); + kstat_kv_u32(&ck->ck_core) = ci->ci_core_id; + kstat_kv_init(&ck->ck_thread, "thread", KSTAT_KV_T_UINT32); + kstat_kv_u32(&ck->ck_thread) = ci->ci_smt_id; + + kstat_kv_init(&ck->ck_vendor, "vendor", KSTAT_KV_T_ISTR); + strlcpy(kstat_kv_istr(&ck->ck_vendor), cpu_vendor, /* XXX */ + sizeof(kstat_kv_istr(&ck->ck_vendor))); + + kstat_kv_init(&ck->ck_brand, "brand", KSTAT_KV_T_STR); + kstat_kv_len(&ck->ck_brand) = sizeof(ck->ck_brand_str); + KASSERT(sizeof(ck->ck_brand_str) == sizeof(ci->ci_brand)); + memcpy(ck->ck_brand_str, ci->ci_brand, sizeof(ck->ck_brand_str)); + + kstat_kv_init(&ck->ck_family, "family", KSTAT_KV_T_ISTR); + snprintf(kstat_kv_istr(&ck->ck_family), + sizeof(kstat_kv_istr(&ck->ck_family)), "%02xh", ci->ci_family); + + kstat_kv_init(&ck->ck_model, "model", KSTAT_KV_T_ISTR); + snprintf(kstat_kv_istr(&ck->ck_model), + sizeof(kstat_kv_istr(&ck->ck_model)), "%02xh", ci->ci_model); + + kstat_kv_init(&ck->ck_stepping, "stepping", KSTAT_KV_T_ISTR); + snprintf(kstat_kv_istr(&ck->ck_stepping), + sizeof(kstat_kv_istr(&ck->ck_stepping)), "%02xh", + ci->ci_signature & 0x0f); + + ks->ks_data = ck; + ks->ks_datalen = sizeof(*ck); + ks->ks_softc = ci; + + kstat_install(ks); +} +#endif /* NKSTAT > 0 */ Index: sys/arch/amd64/conf/GENERIC =================================================================== RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v retrieving revision 1.518 diff -u -p -r1.518 GENERIC --- sys/arch/amd64/conf/GENERIC 8 Jul 2023 02:43:02 -0000 1.518 +++ sys/arch/amd64/conf/GENERIC 22 Nov 2023 03:58:04 -0000 @@ -35,6 +35,7 @@ isa0 at amdpcib? isa0 at tcpcib? 
pci* at mainbus0 vmm0 at mainbus0 +cpumon0 at mainbus0 pvbus0 at mainbus0 acpi0 at bios0 Index: sys/arch/amd64/conf/files.amd64 =================================================================== RCS file: /cvs/src/sys/arch/amd64/conf/files.amd64,v retrieving revision 1.109 diff -u -p -r1.109 files.amd64 --- sys/arch/amd64/conf/files.amd64 8 Jul 2023 08:01:10 -0000 1.109 +++ sys/arch/amd64/conf/files.amd64 22 Nov 2023 03:58:04 -0000 @@ -260,6 +260,13 @@ file arch/amd64/amd64/vmm_machdep.c vmm file arch/amd64/amd64/vmm_support.S vmm # +# MSR kstats +# +device cpumon {} +attach cpumon at mainbus +file arch/amd64/amd64/cpumon.c cpumon needs-flag + +# # Machine-independent SD/MMC drivers # include "dev/sdmmc/files.sdmmc" Index: sys/arch/amd64/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v retrieving revision 1.158 diff -u -p -r1.158 cpu.h --- sys/arch/amd64/include/cpu.h 27 Jul 2023 00:28:24 -0000 1.158 +++ sys/arch/amd64/include/cpu.h 22 Nov 2023 03:58:04 -0000 @@ -92,6 +92,11 @@ union vmm_cpu_cap { }; /* + * for xcalls + */ +struct task; + +/* * Locks used to protect struct members in this file: * I immutable after creation * a atomic operations @@ -199,6 +204,7 @@ struct cpu_info { #ifdef MULTIPROCESSOR struct srp_hazard ci_srp_hazards[SRP_HAZARD_NUM]; + struct task *ci_xcalls[4]; #endif struct ksensordev ci_sensordev; Index: sys/arch/amd64/include/intr.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/intr.h,v retrieving revision 1.33 diff -u -p -r1.33 intr.h --- sys/arch/amd64/include/intr.h 14 Dec 2021 18:16:14 -0000 1.33 +++ sys/arch/amd64/include/intr.h 22 Nov 2023 03:58:04 -0000 @@ -207,6 +207,9 @@ void cpu_intr_init(struct cpu_info *); void intr_printconfig(void); void intr_barrier(void *); +struct task; +void cpu_xcall(struct cpu_info *ci, struct task *); + #ifdef MULTIPROCESSOR void x86_send_ipi(struct cpu_info *, int); int x86_fast_ipi(struct cpu_info *, int); @@ -215,6 +218,8 @@ void x86_ipi_handler(void); void x86_setperf_ipi(struct cpu_info *); extern void (*ipifunc[X86_NIPI])(struct cpu_info *); + +extern void Xxcallintr(void); #endif #endif /* !_LOCORE */ Index: sys/arch/amd64/include/intrdefs.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/intrdefs.h,v retrieving revision 1.22 diff -u -p -r1.22 intrdefs.h --- sys/arch/amd64/include/intrdefs.h 31 Aug 2021 17:40:59 -0000 1.22 +++ sys/arch/amd64/include/intrdefs.h 22 Nov 2023 03:58:04 -0000 @@ -54,9 +54,10 @@ #define SIR_CLOCK 61 #define SIR_NET 60 #define SIR_TTY 59 +#define SIR_XCALL 58 -#define LIR_XEN 58 -#define LIR_HYPERV 57 +#define LIR_XEN 57 +#define LIR_HYPERV 56 /* * Maximum # of interrupt sources per CPU. 64 to fit in one word. 
@@ -84,8 +85,9 @@ #define X86_IPI_START_VMM 0x00000100 #define X86_IPI_STOP_VMM 0x00000200 #define X86_IPI_WBINVD 0x00000400 +#define X86_IPI_XCALL 0x00000800 -#define X86_NIPI 12 +#define X86_NIPI 13 #define IREENT_MAGIC 0x18041969 Index: usr.bin/kstat/kstat.c =================================================================== RCS file: /cvs/src/usr.bin/kstat/kstat.c,v retrieving revision 1.13 diff -u -p -r1.13 kstat.c --- usr.bin/kstat/kstat.c 16 Nov 2023 03:17:34 -0000 1.13 +++ usr.bin/kstat/kstat.c 22 Nov 2023 03:58:04 -0000 @@ -490,6 +490,12 @@ kstat_kv(const void *d, ssize_t len) case KSTAT_KV_U_CYCLES: printf(" cycles"); break; + case KSTAT_KV_U_INSTR: + printf(" instructions"); + break; + case KSTAT_KV_U_UJOULES: + printf(" micro-joules"); + break; default: printf(" unit-type-%u", kv->kv_unit);
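As a review aid, here is a small userland sketch of the RAPL accounting the
diff does across the ticks and cpumon_rapl_read(). It is illustrative only:
plain C with made-up names, not part of the diff, and it assumes the common
energy status unit (ESU) of 16, i.e. 2^-16 joules per count.

/* rapl_acc.c -- standalone model of the cpumon_rapl accumulator */

#include <stdio.h>
#include <stdint.h>

struct rapl_acc {
	uint64_t energy;	/* accumulated raw ESU counts */
	uint32_t prev;		/* last sample of the 32-bit MSR */
	unsigned int unit;	/* ESU: one count is 2^-unit joules */
};

/*
 * unsigned 32-bit subtraction absorbs at most one counter wrap per
 * sample, which is why the kernel ticks only need to run more often
 * than the counter wraps (every 65536J at an ESU of 16, or roughly
 * 11 minutes at a constant 100W, so a 53 second tick is plenty).
 */
static void
rapl_sample(struct rapl_acc *r, uint32_t now)
{
	r->energy += (uint32_t)(now - r->prev);
	r->prev = now;
}

/*
 * the conversion in cpumon_rapl_read(): scale to microjoules, then
 * shift the ESU out.  the multiply overflows uint64_t once energy
 * exceeds about 1.8e13 counts; at an ESU of 16 and a constant 100W
 * that is on the order of a month of accumulation, so the XXX in
 * the diff is a real, if slow, problem.
 */
static uint64_t
rapl_ujoules(const struct rapl_acc *r)
{
	return ((r->energy * 1000000) >> r->unit);
}

int
main(void)
{
	struct rapl_acc r = { 0, 0xfffffff0, 16 };

	/* the 32-bit counter wrapped between samples: diff is 0x20 */
	rapl_sample(&r, 0x00000010);

	printf("%llu counts, %llu uJ\n",
	    (unsigned long long)r.energy,
	    (unsigned long long)rapl_ujoules(&r));

	return (0);
}

Shifting the ESU out before multiplying (or accumulating in whole joules)
would trade precision for headroom if the slow overflow ever matters.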