? amd64/msrs.c.original ? compile/GENERIC.MP.PROF ? compile/PROFILED.MP ? compile/WITNESS.MP ? conf/PROFILED.MP ? conf/WITNESS.MP Index: amd64/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v retrieving revision 1.183 diff -u -p -r1.183 cpu.c --- amd64/cpu.c 25 Feb 2024 22:33:09 -0000 1.183 +++ amd64/cpu.c 9 Apr 2024 13:05:05 -0000 @@ -884,8 +884,6 @@ cpu_init_vmm(struct cpu_info *ci) if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region, &ci->ci_vmxon_region_pa)) panic("Can't locate VMXON region in phys mem"); - ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR; - rw_init(&ci->ci_vmcs_lock, "vmcslock"); } } #endif /* NVMM > 0 */ Index: amd64/cpumon.c =================================================================== RCS file: amd64/cpumon.c diff -N amd64/cpumon.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ amd64/cpumon.c 9 Apr 2024 13:05:05 -0000 @@ -0,0 +1,826 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022, 2023 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "kstat.h" + +#if NKSTAT == 0 +#error cpumon(4) requires kstat(4) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MSR_RAPL_PWR_UNIT 0xc0010299 +#define MSR_RAPL_PWR_UNIT_ESU_SHIFT 8 +#define MSR_RAPL_PWR_UNIT_ESU_MASK 0x1f +#define MSR_CORE_ENERGY_STATE 0xc001029a +#define MSR_PKG_ENERGY_STATE 0xc001029b + +#define CPUMON_MSR_AMD_RAPL_PWR_UNIT 0xC0010299 +#define CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT 8 +#define CPUMON_MSR_AMD_SR_RAPL_PWR_UNIT_ESU_MASK 0x1f +#define CPUMON_MSR_AMD_CORE_ENERGY_STAT 0xC001029A +#define CPUMON_MSR_AMD_PKG_ENERGY_STAT 0xC001029B + +struct cpumon_rapl { + uint64_t rapl_energy; /* accumulator */ + uint32_t rapl_energy_prev; + unsigned int rapl_energy_unit; + uint32_t rapl_energy_msr; +}; + +enum cpumon_core_map { + CPUMON_CORE_MAP_TSC, + CPUMON_CORE_MAP_EFF_PERF, + CPUMON_CORE_MAP_SMI, + CPUMON_CORE_MAP_IRPERF, + + CPUMON_CORE_MAP_TEMP, + CPUMON_CORE_MAP_RAPL, + + CPUMON_CORE_MAP_COUNT +}; + +struct cpumon_core { + struct cpumon_softc *c_sc; + struct cpu_info *c_ci; + struct kstat *c_ks; + + TAILQ_ENTRY(cpumon_core) c_entry; + + unsigned int c_nkvs; + int8_t c_map[CPUMON_CORE_MAP_COUNT]; + + int c_temp_max; + + struct cpumon_rapl c_rapl; + struct task c_rapl_xcall; +}; + +TAILQ_HEAD(cpumon_cores, cpumon_core); + +enum cpumon_pkg_map { + CPUMON_PKG_MAP_RAPL_PP0, + CPUMON_PKG_MAP_RAPL_PKG, + CPUMON_PKG_MAP_RAPL_RAM, + CPUMON_PKG_MAP_RAPL_PP1, + CPUMON_PKG_MAP_RAPL_PSYS, + + CPUMON_PKG_MAP_COUNT +}; + +static const char *cpump_pkg_names[] = { + [CPUMON_PKG_MAP_RAPL_PP0] = "pp0", + [CPUMON_PKG_MAP_RAPL_PKG] = "pkg", + [CPUMON_PKG_MAP_RAPL_RAM] = "ram", + [CPUMON_PKG_MAP_RAPL_PP1] = "pp1", + 
[CPUMON_PKG_MAP_RAPL_PSYS] = "psys", +}; + +struct cpumon_pkg { + struct cpumon_softc *p_sc; + struct cpu_info *p_ci; + struct kstat *p_ks; + + TAILQ_ENTRY(cpumon_pkg) p_entry; + + unsigned int p_nkvs; + int8_t p_map[CPUMON_PKG_MAP_COUNT]; + + struct cpumon_rapl p_rapl[CPUMON_PKG_MAP_COUNT]; + struct task p_rapl_xcall; +}; + +TAILQ_HEAD(cpumon_pkgs, cpumon_pkg); + +struct cpumon_softc { + struct device sc_dev; + struct task sc_deferred; + + struct cpumon_cores sc_cores; + struct cpumon_pkgs sc_pkgs; + + /* used by the ticks below to wait for all cores/pkgs to read stuff */ + struct refcnt sc_rapl_refs; + + struct timeout sc_core_rapl_tick; + struct timeout sc_pkg_rapl_tick; +}; + +static int cpumon_match(struct device *, void *, void *); +static void cpumon_attach(struct device *, struct device *, void *); + +struct cfdriver cpumon_cd = { + NULL, "cpumon", DV_DULL, CD_SKIPHIBERNATE +}; + +const struct cfattach cpumon_ca = { + sizeof(struct cpumon_softc), cpumon_match, cpumon_attach, NULL, NULL +}; + +static void cpumon_deferred(void *); +static struct cpumon_core * + cpumon_attach_core(struct cpumon_softc *, struct cpu_info *); +static struct cpumon_pkg * + cpumon_attach_pkg(struct cpumon_softc *, struct cpu_info *); + +static void cpumon_core_rapl_tick(void *); +static void cpumon_pkg_rapl_tick(void *); + +static int +cpumon_match(struct device *parent, void *match, void *aux) +{ + const char **busname = (const char **)aux; + + if (strcmp(*busname, cpumon_cd.cd_name) != 0) + return (0); + + return (1); +} + +static void +cpumon_attach(struct device *parent, struct device *self, void *aux) +{ + struct cpumon_softc *sc = (struct cpumon_softc *)self; + + printf("\n"); + + task_set(&sc->sc_deferred, cpumon_deferred, sc); + TAILQ_INIT(&sc->sc_cores); + TAILQ_INIT(&sc->sc_pkgs); + timeout_set_proc(&sc->sc_core_rapl_tick, cpumon_core_rapl_tick, sc); + timeout_set_proc(&sc->sc_pkg_rapl_tick, cpumon_pkg_rapl_tick, sc); + + task_add(systqmp, &sc->sc_deferred); +} + +static inline uint32_t +cpumon_rapl_read_msr(const struct cpumon_rapl *rapl) +{ + return (rdmsr(rapl->rapl_energy_msr)); +} + +static void +cpumon_core_rapl_xcall(void *arg) +{ + struct cpumon_core *c = arg; + struct cpumon_softc *sc = c->c_sc; + struct cpumon_rapl *rapl = &c->c_rapl; + uint32_t energy_now; + uint32_t diff; + + energy_now = cpumon_rapl_read_msr(rapl); + diff = energy_now - rapl->rapl_energy_prev; + + rapl->rapl_energy_prev = energy_now; + rapl->rapl_energy += diff; + + refcnt_rele_wake(&sc->sc_rapl_refs); +} + +static void +cpumon_pkg_rapl_xcall(void *arg) +{ + struct cpumon_pkg *p = arg; + struct cpumon_softc *sc = p->p_sc; + struct cpumon_rapl *rapl; + uint32_t energy_now; + uint32_t diff; + size_t i; + + for (i = 0; i < nitems(p->p_rapl); i++) { + if (p->p_map[i] == 0) + continue; + + rapl = &p->p_rapl[i]; + + energy_now = cpumon_rapl_read_msr(rapl); + diff = energy_now - rapl->rapl_energy_prev; + + rapl->rapl_energy_prev = energy_now; + rapl->rapl_energy += diff; + } + + refcnt_rele_wake(&sc->sc_rapl_refs); +} + +static uint64_t +cpumon_rapl_read(struct cpumon_rapl *rapl, uint32_t energy_now) +{ + uint32_t diff = energy_now - rapl->rapl_energy_prev; + uint64_t energy = rapl->rapl_energy + diff; + + rapl->rapl_energy_prev = energy_now; + rapl->rapl_energy = energy; + + /* XXX i feel like this will overflow */ + return ((energy * 1000000) >> rapl->rapl_energy_unit); +} + +static void +cpumon_probe_core_effperf(struct cpumon_core *c) +{ + uint32_t eax, ebx, ecx, edx; + + CPUID(0x06, eax, ebx, ecx, edx); + + if (ecx & 
(1 << 0)) { + c->c_map[CPUMON_CORE_MAP_EFF_PERF] = c->c_nkvs; + c->c_nkvs += 2; + } +} + +static void +cpumon_probe_core_intel(struct cpumon_core *c) +{ + struct cpu_info *ci = c->c_ci; + + if (cpuid_level >= 0x06) { + cpumon_probe_core_effperf(c); + + switch (ci->ci_model) { + case 0x45: /* Haswell mobile */ + c->c_map[CPUMON_CORE_MAP_SMI] = c->c_nkvs; + c->c_nkvs += 1; + } + } + + if (ISSET(ci->ci_feature_tpmflags, TPM_SENSOR)) { + c->c_map[CPUMON_CORE_MAP_TEMP] = c->c_nkvs; + c->c_nkvs += 1; + + c->c_temp_max = 100; + + /* Only some Core family chips have MSR_TEMPERATURE_TARGET. */ + if (ci->ci_model == 0x0e && + (rdmsr(MSR_TEMPERATURE_TARGET_UNDOCUMENTED) & + MSR_TEMPERATURE_TARGET_LOW_BIT_UNDOCUMENTED)) + c->c_temp_max = 85; + + /* + * Newer CPUs can tell you what their max temperature is. + * See: '64-ia-32-architectures-software-developer- + * vol-3c-part-3-manual.pdf' + */ + if (ci->ci_model > 0x17 && ci->ci_model != 0x1c && + ci->ci_model != 0x26 && ci->ci_model != 0x27 && + ci->ci_model != 0x35 && ci->ci_model != 0x36) + c->c_temp_max = MSR_TEMPERATURE_TARGET_TJMAX( + rdmsr(MSR_TEMPERATURE_TARGET)); + } +} + +static void +cpumon_probe_core_amd(struct cpumon_core *c) +{ + cpumon_probe_core_effperf(c); + + if (c->c_ci->ci_family >= 0x17) { + uint32_t eax, ebx, ecx, edx; + + CPUID(0x80000008, eax, ebx, ecx, edx); + if (ebx & (1 << 1)) { + c->c_map[CPUMON_CORE_MAP_IRPERF] = c->c_nkvs; + c->c_nkvs += 1; + } + + CPUID(0x80000007, eax, ebx, ecx, edx); + if (edx & (1 << 14)) { + c->c_map[CPUMON_CORE_MAP_RAPL] = c->c_nkvs; + c->c_nkvs += 1; + } + } +} + +static void +cpumon_deferred(void *arg) +{ + struct cpumon_softc *sc = arg; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + struct cpumon_core *c; + int rapl = 0; + + CPU_INFO_FOREACH(cii, ci) { + sched_peg_curproc(ci); + + c = cpumon_attach_core(sc, ci); + if (c && c->c_map[CPUMON_CORE_MAP_RAPL]) + rapl = 1; + + cpumon_attach_pkg(sc, ci); + } + + atomic_clearbits_int(&curproc->p_flag, P_CPUPEG); + + if (rapl) + timeout_add_sec(&sc->sc_core_rapl_tick, 53); + + if (!TAILQ_EMPTY(&sc->sc_pkgs)) + timeout_add_sec(&sc->sc_pkg_rapl_tick, 5); +} + +static void +cpumon_core_rapl_tick(void *arg) +{ + struct cpumon_softc *sc = arg; + struct cpumon_core *c; + + refcnt_init(&sc->sc_rapl_refs); + + TAILQ_FOREACH(c, &sc->sc_cores, c_entry) { + if (c->c_map[CPUMON_CORE_MAP_RAPL] == 0) { + /* is this even possible? 
*/
+			continue;
+		}
+
+		refcnt_take(&sc->sc_rapl_refs);
+		cpu_xcall(c->c_ci, &c->c_rapl_xcall);
+	}
+
+	refcnt_finalize(&sc->sc_rapl_refs, "raplcore");
+
+	/* this doesn't have to be accurate */
+	timeout_add_sec(&sc->sc_core_rapl_tick, 53);
+}
+
+static void
+cpumon_pkg_rapl_tick(void *arg)
+{
+	struct cpumon_softc *sc = arg;
+	struct cpumon_pkg *p;
+
+	refcnt_init(&sc->sc_rapl_refs);
+
+	TAILQ_FOREACH(p, &sc->sc_pkgs, p_entry) {
+		refcnt_take(&sc->sc_rapl_refs);
+		cpu_xcall(p->p_ci, &p->p_rapl_xcall);
+	}
+
+	refcnt_finalize(&sc->sc_rapl_refs, "raplpkg");
+
+	/* this doesn't have to be accurate */
+	timeout_add_sec(&sc->sc_pkg_rapl_tick, 7);
+}
+
+struct cpumon_xcall {
+	struct kstat *cx_ks;
+	struct cond cx_c;
+};
+
+static void
+cpumon_read_core_xcall(void *arg)
+{
+	struct cpumon_xcall *cx = arg;
+	struct kstat *ks = cx->cx_ks;
+	struct kstat_kv *kvs = ks->ks_data;
+	struct cpumon_core *c = ks->ks_softc;
+	unsigned long s;
+	uint32_t energy_now;
+	int idx, rapl;
+
+	/* this isn't timing critical */
+	idx = c->c_map[CPUMON_CORE_MAP_TEMP];
+	if (idx) {
+		uint64_t msr = rdmsr(MSR_THERM_STATUS);
+
+		if (msr & MSR_THERM_STATUS_VALID_BIT) {
+			uint64_t v;
+
+			v = c->c_temp_max - MSR_THERM_STATUS_TEMP(msr);
+			/* micro degrees */
+			v *= 1000000;
+			/* kelvin */
+			v += 273150000;
+
+			kvs[idx].kv_type = KSTAT_KV_T_TEMP;
+			kstat_kv_temp(&kvs[idx]) = v;
+		} else
+			kvs[idx].kv_type = KSTAT_KV_T_NULL;
+	}
+
+	s = intr_disable();
+	idx = c->c_map[CPUMON_CORE_MAP_TSC];
+	if (idx)
+		kstat_kv_u64(&kvs[idx]) = rdtsc_lfence();
+
+	idx = c->c_map[CPUMON_CORE_MAP_EFF_PERF];
+	if (idx) {
+		kstat_kv_u64(&kvs[idx + 0]) = rdmsr(0xe7);
+		kstat_kv_u64(&kvs[idx + 1]) = rdmsr(0xe8);
+	}
+
+	idx = c->c_map[CPUMON_CORE_MAP_SMI];
+	if (idx)
+		kstat_kv_u32(&kvs[idx]) = rdmsr(0x34);
+
+	idx = c->c_map[CPUMON_CORE_MAP_IRPERF];
+	if (idx)
+		kstat_kv_u64(&kvs[idx]) = rdmsr(0xc00000e9);
+
+	rapl = c->c_map[CPUMON_CORE_MAP_RAPL];
+	if (rapl)
+		energy_now = cpumon_rapl_read_msr(&c->c_rapl);
+
+	nanouptime(&ks->ks_updated);
+	intr_restore(s);
+
+	if (rapl) {
+		kstat_kv_u64(&kvs[rapl]) =
+		    cpumon_rapl_read(&c->c_rapl, energy_now);
+	}
+
+	cond_signal(&cx->cx_c);
+}
+
+static int
+cpumon_read_core(struct kstat *ks)
+{
+	struct timespec now, diff;
+
+	/* rate limit the updates to roughly twice a second */
+	getnanouptime(&now);
+	timespecsub(&now, &ks->ks_updated, &diff);
+	if (diff.tv_sec > 0 || diff.tv_nsec > 500000000) {
+		struct cpumon_xcall cx = { ks, COND_INITIALIZER() };
+		struct task t = TASK_INITIALIZER(cpumon_read_core_xcall, &cx);
+		struct cpumon_core *c = ks->ks_softc;
+
+		cpu_xcall(c->c_ci, &t);
+
+		cond_wait(&cx.cx_c, "cpumonc");
+	}
+
+	return (0);
+}
+
+static struct cpumon_core *
+cpumon_attach_core(struct cpumon_softc *sc, struct cpu_info *ci)
+{
+	struct kstat *ks;
+	struct kstat_kv *kvs;
+	struct cpumon_core *c;
+	int idx;
+
+	TAILQ_FOREACH(c, &sc->sc_cores, c_entry) {
+		if (ci->ci_pkg_id == c->c_ci->ci_pkg_id &&
+		    ci->ci_core_id == c->c_ci->ci_core_id) {
+			/* core is already being monitored */
+
+			if (ci->ci_smt_id < c->c_ci->ci_smt_id) {
+				/* prefer low threads */
+				c->c_ci = ci;
+			}
+
+			return (NULL);
+		}
+	}
+
+	ks = kstat_create("cpu-core", ci->ci_pkg_id << 24 | ci->ci_core_id,
+	    "cpumon", 0, KSTAT_T_KV, 0);
+	if (ks == NULL) {
+		printf("unable to create cpu-core kstat for pkg %u core %d\n",
+		    ci->ci_pkg_id, ci->ci_core_id);
+		return (NULL);
+	}
+
+	c = malloc(sizeof(*c), M_DEVBUF, M_WAITOK|M_ZERO);
+	c->c_sc = sc;
+	c->c_ci = ci;
+	c->c_ks = ks;
+	c->c_nkvs = 2;	/* pkg and core ids */
+
+	/* assume we have tsc */
+ c->c_map[CPUMON_CORE_MAP_TSC] = c->c_nkvs; + c->c_nkvs += 1; + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + cpumon_probe_core_intel(c); + else if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + cpumon_probe_core_amd(c); + + kvs = mallocarray(c->c_nkvs, sizeof(*kvs), M_DEVBUF, M_WAITOK|M_ZERO); + + kstat_kv_init(&kvs[0], "package", KSTAT_KV_T_UINT32); + kstat_kv_u32(&kvs[0]) = ci->ci_pkg_id; + kstat_kv_init(&kvs[1], "core", KSTAT_KV_T_UINT32); + kstat_kv_u32(&kvs[1]) = ci->ci_core_id; + + idx = c->c_map[CPUMON_CORE_MAP_TSC]; + if (idx) { + kstat_kv_unit_init(&kvs[idx], "tsc", + KSTAT_KV_T_COUNTER64, KSTAT_KV_U_CYCLES); + } + + idx = c->c_map[CPUMON_CORE_MAP_EFF_PERF]; + if (idx) { + kstat_kv_init(&kvs[idx + 0], "mperf", KSTAT_KV_T_COUNTER64); + kstat_kv_init(&kvs[idx + 1], "aperf", KSTAT_KV_T_COUNTER64); + } + + idx = c->c_map[CPUMON_CORE_MAP_SMI]; + if (idx) + kstat_kv_init(&kvs[idx], "smi", KSTAT_KV_T_COUNTER32); + + idx = c->c_map[CPUMON_CORE_MAP_IRPERF]; + if (idx) { + uint64_t msr; + + msr = rdmsr(0xC0010015); + SET(msr, (1 << 30)); + wrmsr(0xC0010015, msr); + + kstat_kv_unit_init(&kvs[idx], "irperf", + KSTAT_KV_T_COUNTER64, KSTAT_KV_U_INSTR); + } + + idx = c->c_map[CPUMON_CORE_MAP_TEMP]; + if (idx) + kstat_kv_init(&kvs[idx], "temperature", KSTAT_KV_T_TEMP); + + idx = c->c_map[CPUMON_CORE_MAP_RAPL]; + if (idx) { + uint64_t rapl_pwr_unit; + unsigned int unit; + + rapl_pwr_unit = rdmsr(CPUMON_MSR_AMD_RAPL_PWR_UNIT); + unit = rapl_pwr_unit >> CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT; + unit &= CPUMON_MSR_AMD_SR_RAPL_PWR_UNIT_ESU_MASK; + + task_set(&c->c_rapl_xcall, cpumon_core_rapl_xcall, c); + + c->c_rapl.rapl_energy_msr = CPUMON_MSR_AMD_CORE_ENERGY_STAT; + c->c_rapl.rapl_energy_prev = rdmsr(c->c_rapl.rapl_energy_msr); + c->c_rapl.rapl_energy_unit = unit; + + kstat_kv_unit_init(&kvs[idx], "energy", + KSTAT_KV_T_COUNTER64, KSTAT_KV_U_UJOULES); + } + + ks->ks_data = kvs; + ks->ks_datalen = c->c_nkvs * sizeof(*kvs); + ks->ks_read = cpumon_read_core; + ks->ks_softc = c; + + kstat_install(ks); + + TAILQ_INSERT_TAIL(&sc->sc_cores, c, c_entry); + + return (c); +} + +static void +cpumon_read_pkg_xcall(void *arg) +{ + struct cpumon_xcall *cx = arg; + struct kstat *ks = cx->cx_ks; + struct kstat_kv *kvs = ks->ks_data; + struct cpumon_pkg *p = ks->ks_softc; + unsigned long s; + uint32_t energy_now[nitems(p->p_map)]; + size_t i; + int idx; + + s = intr_disable(); + for (i = 0; i < nitems(p->p_map); i++) { + if (p->p_map[i] == 0) + continue; + + energy_now[i] = cpumon_rapl_read_msr(&p->p_rapl[i]); + } + + nanouptime(&ks->ks_updated); + intr_restore(s); + + for (i = 0; i < nitems(p->p_map); i++) { + idx = p->p_map[i]; + if (idx == 0) + continue; + + energy_now[i] = cpumon_rapl_read_msr(&p->p_rapl[i]); + kstat_kv_u64(&kvs[idx]) = + cpumon_rapl_read(&p->p_rapl[i], energy_now[i]); + } + + cond_signal(&cx->cx_c); +} + +static int +cpumon_read_pkg(struct kstat *ks) +{ + struct timespec now, diff; + + /* rate limit the updates to roughly twice a second */ + getnanouptime(&now); + timespecsub(&now, &ks->ks_updated, &diff); + if (diff.tv_sec > 0 || diff.tv_nsec > 500000000) { + struct cpumon_xcall cx = { ks, COND_INITIALIZER() }; + struct task t = TASK_INITIALIZER(cpumon_read_pkg_xcall, &cx); + struct cpumon_pkg *p = ks->ks_softc; + + cpu_xcall(p->p_ci, &t); + + cond_wait(&cx.cx_c, "cpumonp"); + } + + return (0); +} + +static uint32_t cpumon_intel_rapl_msrs[] = { + [CPUMON_PKG_MAP_RAPL_PP0] = 0x00000639, + [CPUMON_PKG_MAP_RAPL_PKG] = 0x00000611, + [CPUMON_PKG_MAP_RAPL_RAM] = 0x00000619, + 
[CPUMON_PKG_MAP_RAPL_PP1] = 0x00000641, + [CPUMON_PKG_MAP_RAPL_PSYS] = 0x0000064D, +}; + +static int +cpumon_probe_pkg_intel(struct cpumon_pkg *p) +{ + struct cpu_info *ci = p->p_ci; + uint64_t rapl_pwr_unit; + unsigned int unit; + struct cpumon_rapl *rapl; + int rv = 0; + + if (ci->ci_family < 0x06) + return (0); + + switch (ci->ci_model) { + case 0x45: /* Haswell mobile */ + rapl_pwr_unit = rdmsr(0x00000606); + unit = rapl_pwr_unit >> + CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT; + unit &= CPUMON_MSR_AMD_SR_RAPL_PWR_UNIT_ESU_MASK; + + p->p_map[CPUMON_PKG_MAP_RAPL_PP0] = p->p_nkvs; + p->p_nkvs += 1; + + rapl = &p->p_rapl[CPUMON_PKG_MAP_RAPL_PP0]; + rapl->rapl_energy_msr = + cpumon_intel_rapl_msrs[CPUMON_PKG_MAP_RAPL_PP0]; + rapl->rapl_energy_prev = rdmsr(rapl->rapl_energy_msr); + rapl->rapl_energy_unit = unit; + + p->p_map[CPUMON_PKG_MAP_RAPL_PKG] = p->p_nkvs; + p->p_nkvs += 1; + + rapl = &p->p_rapl[CPUMON_PKG_MAP_RAPL_PKG]; + rapl->rapl_energy_msr = + cpumon_intel_rapl_msrs[CPUMON_PKG_MAP_RAPL_PKG]; + rapl->rapl_energy_prev = rdmsr(rapl->rapl_energy_msr); + rapl->rapl_energy_unit = unit; + + p->p_map[CPUMON_PKG_MAP_RAPL_RAM] = p->p_nkvs; + p->p_nkvs += 1; + + rapl = &p->p_rapl[CPUMON_PKG_MAP_RAPL_RAM]; + rapl->rapl_energy_msr = + cpumon_intel_rapl_msrs[CPUMON_PKG_MAP_RAPL_RAM]; + rapl->rapl_energy_prev = rdmsr(rapl->rapl_energy_msr); + rapl->rapl_energy_unit = unit; + + p->p_map[CPUMON_PKG_MAP_RAPL_PP1] = p->p_nkvs; + p->p_nkvs += 1; + + rapl = &p->p_rapl[CPUMON_PKG_MAP_RAPL_PP1]; + rapl->rapl_energy_msr = + cpumon_intel_rapl_msrs[CPUMON_PKG_MAP_RAPL_PP1]; + rapl->rapl_energy_prev = rdmsr(rapl->rapl_energy_msr); + rapl->rapl_energy_unit = unit; + + rv = 1; + } + + return (rv); +} + +static int +cpumon_probe_pkg_amd(struct cpumon_pkg *p) +{ + uint64_t rapl_pwr_unit; + unsigned int unit; + struct cpumon_rapl *rapl; + int rv = 0; + + if (p->p_ci->ci_family >= 0x17) { + uint32_t eax, ebx, ecx, edx; + + CPUID(0x80000007, eax, ebx, ecx, edx); + if (edx & (1 << 14)) { + p->p_map[CPUMON_PKG_MAP_RAPL_PKG] = p->p_nkvs; + p->p_nkvs += 1; + + rapl_pwr_unit = rdmsr(CPUMON_MSR_AMD_RAPL_PWR_UNIT); + unit = rapl_pwr_unit >> + CPUMON_MSR_AMD_RAPL_PWR_UNIT_ESU_SHIFT; + unit &= CPUMON_MSR_AMD_SR_RAPL_PWR_UNIT_ESU_MASK; + + rapl = &p->p_rapl[CPUMON_PKG_MAP_RAPL_PKG]; + + rapl->rapl_energy_msr = CPUMON_MSR_AMD_PKG_ENERGY_STAT; + rapl->rapl_energy_prev = rdmsr(rapl->rapl_energy_msr); + rapl->rapl_energy_unit = unit; + + rv = 1; + } + } + + return (rv); +} + +static struct cpumon_pkg * +cpumon_attach_pkg(struct cpumon_softc *sc, struct cpu_info *ci) +{ + struct kstat *ks; + struct kstat_kv *kvs; + struct cpumon_pkg *p; + int rv = 0; + size_t i; + int idx; + + TAILQ_FOREACH(p, &sc->sc_pkgs, p_entry) { + if (ci->ci_pkg_id == p->p_ci->ci_pkg_id) { + /* pkg is already being monitored */ + + return (NULL); + } + } + + p = malloc(sizeof(*p), M_DEVBUF, M_WAITOK|M_ZERO); + p->p_sc = sc; + p->p_ci = ci; + p->p_nkvs = 1; /* pkg id */ + task_set(&p->p_rapl_xcall, cpumon_pkg_rapl_xcall, p); + + if (strcmp(cpu_vendor, "GenuineIntel") == 0) + rv = cpumon_probe_pkg_intel(p); + else if (strcmp(cpu_vendor, "AuthenticAMD") == 0) + rv = cpumon_probe_pkg_amd(p); + + if (rv == 0) { + free(p, M_DEVBUF, sizeof(*p)); + return (NULL); + } + + ks = kstat_create("cpu-pkg", ci->ci_pkg_id, "cpumon", 0, KSTAT_T_KV, 0); + if (ks == NULL) { + printf("unable to create cpu-pkg kstat for pkg %u\n", + ci->ci_pkg_id); + return (NULL); + } + + kvs = mallocarray(p->p_nkvs, sizeof(*kvs), M_DEVBUF, M_WAITOK|M_ZERO); + + kstat_kv_init(&kvs[0], "package", 
KSTAT_KV_T_UINT32); + kstat_kv_u32(&kvs[0]) = ci->ci_pkg_id; + + for (i = 0; i < nitems(p->p_map); i++) { + idx = p->p_map[i]; + if (idx == 0) + continue; + + kstat_kv_unit_init(&kvs[idx], cpump_pkg_names[i], + KSTAT_KV_T_COUNTER64, KSTAT_KV_U_UJOULES); + } + + ks->ks_data = kvs; + ks->ks_datalen = p->p_nkvs * sizeof(*kvs); + ks->ks_read = cpumon_read_pkg; + ks->ks_softc = p; + + kstat_install(ks); + + TAILQ_INSERT_TAIL(&sc->sc_pkgs, p, p_entry); + + return (p); +} Index: amd64/identcpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/identcpu.c,v retrieving revision 1.138 diff -u -p -r1.138 identcpu.c --- amd64/identcpu.c 3 Sep 2023 09:30:43 -0000 1.138 +++ amd64/identcpu.c 9 Apr 2024 13:05:05 -0000 @@ -36,11 +36,15 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "kstat.h" + #include #include #include #include #include +#include +#include #include "vmm.h" #include "pvbus.h" @@ -61,6 +65,10 @@ void tsc_timecounter_init(struct cpu_inf void cpu_check_vmm_cap(struct cpu_info *); #endif /* NVMM > 0 */ +#if NKSTAT > 0 +static void cpu_kstat_attach(struct cpu_info *); +#endif + /* sysctl wants this. */ char cpu_model[48]; int cpuspeed; @@ -835,6 +843,10 @@ identifycpu(struct cpu_info *ci) sensor_attach(&ci->ci_sensordev, &ci->ci_hz_sensor); #endif } + +#if NKSTAT > 0 + cpu_kstat_attach(ci); +#endif } #ifndef SMALL_KERNEL @@ -1112,3 +1124,68 @@ cpu_check_vmm_cap(struct cpu_info *ci) } } #endif /* NVMM > 0 */ + +#if NKSTAT > 0 +struct cpu_kstat { + struct kstat_kv ck_package; + struct kstat_kv ck_core; + struct kstat_kv ck_thread; + + struct kstat_kv ck_vendor; + struct kstat_kv ck_brand; + char ck_brand_str[48]; + struct kstat_kv ck_family; + struct kstat_kv ck_model; + struct kstat_kv ck_stepping; +}; + +static void +cpu_kstat_attach(struct cpu_info *ci) +{ + struct kstat *ks; + struct cpu_kstat *ck; + + ks = kstat_create(ci->ci_dev->dv_xname, 0, "mach", 0, KSTAT_T_KV, 0); + if (ks == NULL) { + printf("%s: unable to attach kstat\n", ci->ci_dev->dv_xname); + return; + } + + ck = malloc(sizeof(*ck), M_DEVBUF, M_WAITOK|M_ZERO); + + kstat_kv_init(&ck->ck_package, "package", KSTAT_KV_T_UINT32); + kstat_kv_u32(&ck->ck_package) = ci->ci_pkg_id; + kstat_kv_init(&ck->ck_core, "core", KSTAT_KV_T_UINT32); + kstat_kv_u32(&ck->ck_core) = ci->ci_core_id; + kstat_kv_init(&ck->ck_thread, "thread", KSTAT_KV_T_UINT32); + kstat_kv_u32(&ck->ck_thread) = ci->ci_smt_id; + + kstat_kv_init(&ck->ck_vendor, "vendor", KSTAT_KV_T_ISTR); + strlcpy(kstat_kv_istr(&ck->ck_vendor), cpu_vendor, /* XXX */ + sizeof(kstat_kv_istr(&ck->ck_vendor))); + + kstat_kv_init(&ck->ck_brand, "brand", KSTAT_KV_T_STR); + kstat_kv_len(&ck->ck_brand) = sizeof(ck->ck_brand_str); + KASSERT(sizeof(ck->ck_brand_str) == sizeof(ci->ci_brand)); + memcpy(ck->ck_brand_str, ci->ci_brand, sizeof(ck->ck_brand_str)); + + kstat_kv_init(&ck->ck_family, "family", KSTAT_KV_T_ISTR); + snprintf(kstat_kv_istr(&ck->ck_family), + sizeof(kstat_kv_istr(&ck->ck_family)), "%02xh", ci->ci_family); + + kstat_kv_init(&ck->ck_model, "model", KSTAT_KV_T_ISTR); + snprintf(kstat_kv_istr(&ck->ck_model), + sizeof(kstat_kv_istr(&ck->ck_model)), "%02xh", ci->ci_model); + + kstat_kv_init(&ck->ck_stepping, "stepping", KSTAT_KV_T_ISTR); + snprintf(kstat_kv_istr(&ck->ck_stepping), + sizeof(kstat_kv_istr(&ck->ck_stepping)), "%02xh", + ci->ci_signature & 0x0f); + + ks->ks_data = ck; + ks->ks_datalen = sizeof(*ck); + ks->ks_softc = ci; + + kstat_install(ks); +} +#endif /* NKSTAT > 0 */ Index: amd64/intr.c 
=================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/intr.c,v retrieving revision 1.56 diff -u -p -r1.56 intr.c --- amd64/intr.c 19 Jan 2024 18:38:16 -0000 1.56 +++ amd64/intr.c 9 Apr 2024 13:05:05 -0000 @@ -559,7 +559,10 @@ struct intrhand fake_softclock_intrhand; struct intrhand fake_softnet_intrhand; struct intrhand fake_softtty_intrhand; struct intrhand fake_timer_intrhand; +#ifdef MULTIPROCESSOR struct intrhand fake_ipi_intrhand; +struct intrhand fake_xcall_intrhand; +#endif #if NXEN > 0 struct intrhand fake_xen_intrhand; #endif @@ -626,6 +629,17 @@ cpu_intr_init(struct cpu_info *ci) isp->is_handlers = &fake_ipi_intrhand; isp->is_pic = &local_pic; ci->ci_isources[LIR_IPI] = isp; + + isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_NOWAIT|M_ZERO); + if (isp == NULL) + panic("can't allocate fixed interrupt source"); + isp->is_recurse = Xxcallintr; + isp->is_resume = Xxcallintr; + fake_xcall_intrhand.ih_level = IPL_SOFTCLOCK; + fake_xcall_intrhand.ih_flags = IPL_MPSAFE; + isp->is_handlers = &fake_xcall_intrhand; + isp->is_pic = &local_pic; + ci->ci_isources[SIR_XCALL] = isp; #endif #if NXEN > 0 isp = malloc(sizeof (struct intrsource), M_DEVBUF, M_NOWAIT|M_ZERO); Index: amd64/ipi.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/ipi.c,v retrieving revision 1.18 diff -u -p -r1.18 ipi.c --- amd64/ipi.c 10 Nov 2022 08:26:54 -0000 1.18 +++ amd64/ipi.c 9 Apr 2024 13:05:05 -0000 @@ -43,9 +43,9 @@ #include void -x86_send_ipi(struct cpu_info *ci, int ipimask) +x86_send_ipi(struct cpu_info *ci, int ipi) { - x86_atomic_setbits_u32(&ci->ci_ipis, ipimask); + x86_atomic_setbits_u32(&ci->ci_ipis, 1U << ipi); /* Don't send IPI to cpu which isn't (yet) running. 
*/ if (!(ci->ci_flags & CPUF_RUNNING)) @@ -66,10 +66,11 @@ x86_fast_ipi(struct cpu_info *ci, int ip } void -x86_broadcast_ipi(int ipimask) +x86_broadcast_ipi(int ipi) { struct cpu_info *ci, *self = curcpu(); int count = 0; + uint32_t ipimask = 1U << ipi; CPU_INFO_ITERATOR cii; CPU_INFO_FOREACH(cii, ci) { @@ -101,7 +102,6 @@ x86_ipi_handler(void) pending = atomic_swap_uint(&ci->ci_ipis, 0); for (bit = 0; bit < X86_NIPI && pending; bit++) { if (pending & (1 << bit)) { - pending &= ~(1 << bit); (*ipifunc[bit])(ci); evcount_inc(&ipi_count); } Index: amd64/ipifuncs.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/ipifuncs.c,v retrieving revision 1.38 diff -u -p -r1.38 ipifuncs.c --- amd64/ipifuncs.c 30 Oct 2023 12:50:59 -0000 1.38 +++ amd64/ipifuncs.c 9 Apr 2024 13:05:05 -0000 @@ -53,61 +53,36 @@ #include -#include "vmm.h" -#if NVMM > 0 -#include -#endif /* NVMM > 0 */ - void x86_64_ipi_nop(struct cpu_info *); void x86_64_ipi_halt(struct cpu_info *); void x86_64_ipi_wbinvd(struct cpu_info *); - -#if NVMM > 0 -void x86_64_ipi_vmclear_vmm(struct cpu_info *); -void x86_64_ipi_start_vmm(struct cpu_info *); -void x86_64_ipi_stop_vmm(struct cpu_info *); -#endif /* NVMM > 0 */ +void x86_64_ipi_xcall(struct cpu_info *); #include "pctr.h" #if NPCTR > 0 #include #define x86_64_ipi_reload_pctr pctr_reload -#else -#define x86_64_ipi_reload_pctr NULL #endif #ifdef MTRR void x86_64_ipi_reload_mtrr(struct cpu_info *); -#else -#define x86_64_ipi_reload_mtrr NULL #endif -void (*ipifunc[X86_NIPI])(struct cpu_info *) = -{ - x86_64_ipi_halt, - x86_64_ipi_nop, -#if NVMM > 0 - x86_64_ipi_vmclear_vmm, -#else - NULL, -#endif - NULL, - x86_64_ipi_reload_pctr, - x86_64_ipi_reload_mtrr, - x86_setperf_ipi, +void (* const ipifunc[X86_NIPI])(struct cpu_info *) = { + [X86_IPI_HALT] = x86_64_ipi_halt, + [X86_IPI_NOP] = x86_64_ipi_nop, +#if NPCTR > 0 + [X86_IPI_PCTR] = x86_64_ipi_reload_pctr, +#endif +#ifdef MTRR + [X86_IPI_MTRR] = x86_64_ipi_reload_mtrr, +#endif + [X86_IPI_WBINVD] = x86_64_ipi_wbinvd, + [X86_IPI_SETPERF] = x86_setperf_ipi, + [X86_IPI_XCALL] = x86_64_ipi_xcall, #ifdef DDB - x86_ipi_db, -#else - NULL, -#endif -#if NVMM > 0 - x86_64_ipi_start_vmm, - x86_64_ipi_stop_vmm, -#else - NULL, - NULL, + [X86_IPI_DDB] = x86_ipi_db, #endif - x86_64_ipi_wbinvd, }; void @@ -141,28 +116,18 @@ x86_64_ipi_reload_mtrr(struct cpu_info * } #endif -#if NVMM > 0 -void -x86_64_ipi_vmclear_vmm(struct cpu_info *ci) -{ - vmclear_on_cpu(ci); -} - void -x86_64_ipi_start_vmm(struct cpu_info *ci) -{ - start_vmm_on_cpu(ci); -} - -void -x86_64_ipi_stop_vmm(struct cpu_info *ci) +x86_64_ipi_wbinvd(struct cpu_info *ci) { - stop_vmm_on_cpu(ci); + wbinvd(); } -#endif /* NVMM > 0 */ void -x86_64_ipi_wbinvd(struct cpu_info *ci) +x86_64_ipi_xcall(struct cpu_info *ci) { - wbinvd(); -} + /* + * this is an inlining of softintr() because we already have + * curcpu() and the SIR_XCALL bit to set. 
+	 */
+	x86_atomic_setbits_u64(&ci->ci_ipending, 1UL << SIR_XCALL);
+}
Index: amd64/mainbus.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/mainbus.c,v
retrieving revision 1.52
diff -u -p -r1.52 mainbus.c
--- amd64/mainbus.c	21 Feb 2022 11:03:39 -0000	1.52
+++ amd64/mainbus.c	9 Apr 2024 13:05:05 -0000
@@ -49,6 +49,7 @@
 #include "bios.h"
 #include "mpbios.h"
 #include "vmm.h"
+#include "cpumon.h"
 #include "pvbus.h"
 #include "efifb.h"
@@ -253,6 +254,13 @@ mainbus_attach(struct device *parent, st
 	if (isa_has_been_seen == 0)
 		config_found(self, &mba_iba, mainbus_print);
 #endif
+
+#if NCPUMON > 0
+	if (ISSET(cpu_info_primary.ci_feature_flags, CPUID_TSC)) {
+		mba.mba_busname = "cpumon";
+		config_found(self, &mba.mba_busname, mainbus_print);
+	}
+#endif /* NCPUMON > 0 */
 
 #if NVMM > 0
 	if (vmm_enabled()) {
Index: amd64/softintr.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/softintr.c,v
retrieving revision 1.10
diff -u -p -r1.10 softintr.c
--- amd64/softintr.c	11 Sep 2020 09:27:09 -0000	1.10
+++ amd64/softintr.c	9 Apr 2024 13:05:05 -0000
@@ -38,6 +38,9 @@
 #include
 #include
+#include
+#include
+
 #include
 #include
@@ -169,3 +172,58 @@ softintr_disestablish(void *arg)
 	free(sih, M_DEVBUF, sizeof(*sih));
 }
+
+void
+#ifdef MULTIPROCESSOR
+cpu_xcall_self(struct task *t)
+#else
+cpu_xcall(struct cpu_info *ci, struct task *t)
+#endif
+{
+	int s = splsoftclock();
+	(*t->t_func)(t->t_arg);
+	splx(s);
+}
+
+#ifdef MULTIPROCESSOR
+void
+cpu_xcall(struct cpu_info *ci, struct task *t)
+{
+	size_t i;
+
+	if (ci == curcpu()) {
+		/* execute the task immediately on the local cpu */
+		cpu_xcall_self(t);
+		return;
+	}
+
+	for (;;) {
+		for (i = 0; i < nitems(ci->ci_xcalls); i++) {
+			if (atomic_cas_ptr(&ci->ci_xcalls[i],
+			    NULL, t) == NULL) {
+				/* membar_producer(); */
+				x86_send_ipi(ci, X86_IPI_XCALL);
+				return;
+			}
+		}
+
+		CPU_BUSY_CYCLE();
+	}
+}
+
+void
+cpu_xcall_dispatch(void)
+{
+	struct cpu_info *ci = curcpu();
+	struct task *t;
+	size_t i;
+
+	for (i = 0; i < nitems(ci->ci_xcalls); i++) {
+		t = ci->ci_xcalls[i];
+		if (t != NULL) {
+			ci->ci_xcalls[i] = NULL;
+			(*t->t_func)(t->t_arg);
+		}
+	}
+}
+#endif /* MULTIPROCESSOR */
Index: amd64/vector.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/vector.S,v
retrieving revision 1.95
diff -u -p -r1.95 vector.S
--- amd64/vector.S	12 Feb 2024 01:18:17 -0000	1.95
+++ amd64/vector.S	9 Apr 2024 13:05:05 -0000
@@ -1319,3 +1319,17 @@ KIDTVEC(softclock)
 	jmp	retpoline_r13
 	CODEPATCH_END(CPTAG_RETPOLINE_R13)
 END(Xsoftclock)
+
+#ifdef MULTIPROCESSOR
+KIDTVEC(xcallintr)
+	endbr64
+	movl	$IPL_SOFTCLOCK, CPUVAR(ILEVEL)
+	sti
+	incl	CPUVAR(IDEPTH)
+	call	_C_LABEL(cpu_xcall_dispatch)
+	decl	CPUVAR(IDEPTH)
+	CODEPATCH_START
+	jmp	retpoline_r13
+	CODEPATCH_END(CPTAG_RETPOLINE_R13)
+END(Xxcallintr)
+#endif /* MULTIPROCESSOR */
Index: amd64/vmm_machdep.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm_machdep.c,v
retrieving revision 1.20
diff -u -p -r1.20 vmm_machdep.c
--- amd64/vmm_machdep.c	29 Feb 2024 16:10:52 -0000	1.20
+++ amd64/vmm_machdep.c	9 Apr 2024 13:05:05 -0000
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -933,6 +934,13 @@ vmx_pmap_find_pte_ept(pmap_t pmap, paddr
 	return pte;
 }
 
+static void
+vmm_start_xcall(void *null)
+{
+	struct cpu_info *ci = curcpu();
+	start_vmm_on_cpu(ci);
+}
+
 /*
  * vmm_start
  *
@@ -941,34
+949,24 @@ vmx_pmap_find_pte_ept(pmap_t pmap, paddr int vmm_start(void) { - struct cpu_info *self = curcpu(); -#ifdef MULTIPROCESSOR struct cpu_info *ci; CPU_INFO_ITERATOR cii; + struct task t = TASK_INITIALIZER(vmm_start_xcall, NULL); #ifdef MP_LOCKDEBUG int nticks; #endif /* MP_LOCKDEBUG */ -#endif /* MULTIPROCESSOR */ /* VMM is already running */ - if (self->ci_flags & CPUF_VMM) + ci = curcpu(); + if (ISSET(ci->ci_flags, CPUF_VMM)) return (0); - /* Start VMM on this CPU */ - start_vmm_on_cpu(self); - if (!(self->ci_flags & CPUF_VMM)) { - printf("%s: failed to enter VMM mode\n", - self->ci_dev->dv_xname); - return (EIO); - } - -#ifdef MULTIPROCESSOR - /* Broadcast start VMM IPI */ - x86_broadcast_ipi(X86_IPI_START_VMM); + /* scatter */ + CPU_INFO_FOREACH(cii, ci) + cpu_xcall(ci, &t); + /* gather */ CPU_INFO_FOREACH(cii, ci) { - if (ci == self) - continue; #ifdef MP_LOCKDEBUG nticks = __mp_lock_spinout; #endif /* MP_LOCKDEBUG */ @@ -983,11 +981,17 @@ vmm_start(void) #endif /* MP_LOCKDEBUG */ } } -#endif /* MULTIPROCESSOR */ return (0); } +static void +vmm_stop_xcall(void *null) +{ + struct cpu_info *ci = curcpu(); + stop_vmm_on_cpu(ci); +} + /* * vmm_stop * @@ -996,34 +1000,24 @@ vmm_start(void) int vmm_stop(void) { - struct cpu_info *self = curcpu(); -#ifdef MULTIPROCESSOR struct cpu_info *ci; CPU_INFO_ITERATOR cii; + struct task t = TASK_INITIALIZER(vmm_stop_xcall, NULL); #ifdef MP_LOCKDEBUG int nticks; #endif /* MP_LOCKDEBUG */ -#endif /* MULTIPROCESSOR */ /* VMM is not running */ - if (!(self->ci_flags & CPUF_VMM)) + ci = curcpu(); + if (!ISSET(ci->ci_flags, CPUF_VMM)) return (0); - /* Stop VMM on this CPU */ - stop_vmm_on_cpu(self); - if (self->ci_flags & CPUF_VMM) { - printf("%s: failed to exit VMM mode\n", - self->ci_dev->dv_xname); - return (EIO); - } - -#ifdef MULTIPROCESSOR - /* Stop VMM on other CPUs */ - x86_broadcast_ipi(X86_IPI_STOP_VMM); + /* Scatter */ + CPU_INFO_FOREACH(cii, ci) + cpu_xcall(ci, &t); + /* Gather */ CPU_INFO_FOREACH(cii, ci) { - if (ci == self) - continue; #ifdef MP_LOCKDEBUG nticks = __mp_lock_spinout; #endif /* MP_LOCKDEBUG */ @@ -1038,7 +1032,6 @@ vmm_stop(void) #endif /* MP_LOCKDEBUG */ } } -#endif /* MULTIPROCESSOR */ return (0); } @@ -1155,40 +1148,38 @@ stop_vmm_on_cpu(struct cpu_info *ci) * Flush and clear VMCS on 'ci' by executing vmclear. 
* */ + +#ifdef MULTIPROCESSOR +struct vmclear_state { + struct vcpu *vcpu; + struct cond cv; +}; + void -vmclear_on_cpu(struct cpu_info *ci) +vmclear_on_cpu(void *arg) { + struct vmclear_state *vmcs = arg; + struct cpu_info *ci = curcpu(); if ((ci->ci_flags & CPUF_VMM) && (ci->ci_vmm_flags & CI_VMM_VMX)) { - if (vmclear(&ci->ci_vmcs_pa)) + if (vmclear(&vmcs->vcpu->vc_control_pa)) panic("VMCLEAR ipi failed"); - atomic_swap_ulong(&ci->ci_vmcs_pa, VMX_VMCS_PA_CLEAR); } + cond_signal(&vmcs->cv); } -#ifdef MULTIPROCESSOR static int vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *vcpu) { -#ifdef MP_LOCKDEBUG - int nticks = __mp_lock_spinout; -#endif /* MP_LOCKDEBUG */ + struct vmclear_state vmcs = { + .vcpu = vcpu, + .cv = COND_INITIALIZER(), + }; + struct task t = TASK_INITIALIZER(vmclear_on_cpu, &vmcs); - rw_enter_write(&ci->ci_vmcs_lock); - atomic_swap_ulong(&ci->ci_vmcs_pa, vcpu->vc_control_pa); - x86_send_ipi(ci, X86_IPI_VMCLEAR_VMM); + cpu_xcall(ci, &t); + cond_wait(&vmcs.cv, "vmcs"); - while (ci->ci_vmcs_pa != VMX_VMCS_PA_CLEAR) { - CPU_BUSY_CYCLE(); -#ifdef MP_LOCKDEBUG - if (--nticks <= 0) { - db_printf("%s: spun out\n", __func__); - db_enter(); - nticks = __mp_lock_spinout; - } -#endif /* MP_LOCKDEBUG */ - } atomic_swap_uint(&vcpu->vc_vmx_vmcs_state, VMCS_CLEARED); - rw_exit_write(&ci->ci_vmcs_lock); return (0); } Index: conf/GENERIC =================================================================== RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v retrieving revision 1.522 diff -u -p -r1.522 GENERIC --- conf/GENERIC 15 Feb 2024 16:33:56 -0000 1.522 +++ conf/GENERIC 9 Apr 2024 13:05:05 -0000 @@ -35,6 +35,7 @@ isa0 at amdpcib? isa0 at tcpcib? pci* at mainbus0 vmm0 at mainbus0 +cpumon0 at mainbus0 pvbus0 at mainbus0 acpi0 at bios0 Index: conf/files.amd64 =================================================================== RCS file: /cvs/src/sys/arch/amd64/conf/files.amd64,v retrieving revision 1.109 diff -u -p -r1.109 files.amd64 --- conf/files.amd64 8 Jul 2023 08:01:10 -0000 1.109 +++ conf/files.amd64 9 Apr 2024 13:05:05 -0000 @@ -260,6 +260,13 @@ file arch/amd64/amd64/vmm_machdep.c vmm file arch/amd64/amd64/vmm_support.S vmm # +# MSR kstats +# +device cpumon {} +attach cpumon at mainbus +file arch/amd64/amd64/cpumon.c cpumon needs-flag + +# # Machine-independent SD/MMC drivers # include "dev/sdmmc/files.sdmmc" Index: include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v retrieving revision 1.163 diff -u -p -r1.163 cpu.h --- include/cpu.h 25 Feb 2024 19:15:50 -0000 1.163 +++ include/cpu.h 9 Apr 2024 13:05:05 -0000 @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef _KERNEL @@ -92,6 +93,11 @@ union vmm_cpu_cap { }; /* + * for xcalls + */ +struct task; + +/* * Locks used to protect struct members in this file: * I immutable after creation * a atomic operations @@ -201,6 +207,8 @@ struct cpu_info { #ifdef MULTIPROCESSOR struct srp_hazard ci_srp_hazards[SRP_HAZARD_NUM]; + struct uvm_percpu ci_uvm; /* [o] page cache */ + struct task *ci_xcalls[4]; #endif struct ksensordev ci_sensordev; @@ -224,9 +232,6 @@ struct cpu_info { struct vcpu *ci_guest_vcpu; /* [o] last vcpu resumed */ char ci_panicbuf[512]; - - paddr_t ci_vmcs_pa; - struct rwlock ci_vmcs_lock; struct clockqueue ci_queue; }; Index: include/intr.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/intr.h,v retrieving revision 1.33 diff -u -p -r1.33 intr.h --- include/intr.h 14 Dec 2021 
18:16:14 -0000 1.33 +++ include/intr.h 9 Apr 2024 13:05:05 -0000 @@ -207,6 +207,9 @@ void cpu_intr_init(struct cpu_info *); void intr_printconfig(void); void intr_barrier(void *); +struct task; +void cpu_xcall(struct cpu_info *ci, struct task *); + #ifdef MULTIPROCESSOR void x86_send_ipi(struct cpu_info *, int); int x86_fast_ipi(struct cpu_info *, int); @@ -214,7 +217,9 @@ void x86_broadcast_ipi(int); void x86_ipi_handler(void); void x86_setperf_ipi(struct cpu_info *); -extern void (*ipifunc[X86_NIPI])(struct cpu_info *); +extern void (* const ipifunc[X86_NIPI])(struct cpu_info *); + +extern void Xxcallintr(void); #endif #endif /* !_LOCORE */ Index: include/intrdefs.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/intrdefs.h,v retrieving revision 1.23 diff -u -p -r1.23 intrdefs.h --- include/intrdefs.h 4 Jan 2024 20:50:43 -0000 1.23 +++ include/intrdefs.h 9 Apr 2024 13:05:05 -0000 @@ -54,9 +54,10 @@ #define SIR_CLOCK 61 #define SIR_NET 60 #define SIR_TTY 59 +#define SIR_XCALL 58 -#define LIR_XEN 58 -#define LIR_HYPERV 57 +#define LIR_XEN 57 +#define LIR_HYPERV 56 /* * Maximum # of interrupt sources per CPU. 64 to fit in one word. @@ -74,18 +75,16 @@ #define IDT_INTR_LOW (0x20 + NUM_LEGACY_IRQS) #define IDT_INTR_HIGH 0xef -#define X86_IPI_HALT 0x00000001 -#define X86_IPI_NOP 0x00000002 -#define X86_IPI_VMCLEAR_VMM 0x00000004 -#define X86_IPI_PCTR 0x00000010 -#define X86_IPI_MTRR 0x00000020 -#define X86_IPI_SETPERF 0x00000040 -#define X86_IPI_DDB 0x00000080 -#define X86_IPI_START_VMM 0x00000100 -#define X86_IPI_STOP_VMM 0x00000200 -#define X86_IPI_WBINVD 0x00000400 +#define X86_IPI_HALT 0 +#define X86_IPI_NOP 1 +#define X86_IPI_PCTR 2 +#define X86_IPI_MTRR 3 +#define X86_IPI_SETPERF 4 +#define X86_IPI_DDB 5 +#define X86_IPI_WBINVD 6 +#define X86_IPI_XCALL 7 -#define X86_NIPI 12 +#define X86_NIPI 8 #define IREENT_MAGIC 0x18041969 Index: include/vmmvar.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v retrieving revision 1.98 diff -u -p -r1.98 vmmvar.h --- include/vmmvar.h 20 Jan 2024 20:11:24 -0000 1.98 +++ include/vmmvar.h 9 Apr 2024 13:05:05 -0000 @@ -927,7 +927,6 @@ int svm_enter_guest(uint64_t, struct vcp struct region_descriptor *); void start_vmm_on_cpu(struct cpu_info *); void stop_vmm_on_cpu(struct cpu_info *); -void vmclear_on_cpu(struct cpu_info *); void vmm_attach_machdep(struct device *, struct device *, void *); void vmm_activate_machdep(struct device *, int); int vmmioctl_machdep(dev_t, u_long, caddr_t, int, struct proc *);
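
For review context: the synchronous cross-call idiom this diff introduces, and then relies on in cpumon_read_core(), cpumon_read_pkg() and vmx_remote_vmclear(), boils down to queueing a task on the target CPU with cpu_xcall() and sleeping on a cond until that task has run there. Below is a minimal sketch of a caller; the function names, the rdtsc_lfence() payload, the "extsc" wait message and the include list are illustrative assumptions, not part of the diff.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/task.h>

#include <machine/cpu.h>
#include <machine/cpufunc.h>
#include <machine/intr.h>

struct example_xcall {
	uint64_t	ex_tsc;
	struct cond	ex_c;
};

/* runs on the target CPU at IPL_SOFTCLOCK, via the SIR_XCALL soft interrupt */
static void
example_xcall(void *arg)
{
	struct example_xcall *ex = arg;

	ex->ex_tsc = rdtsc_lfence();
	cond_signal(&ex->ex_c);
}

/* read the TSC on another CPU and wait for the result */
static uint64_t
example_remote_tsc(struct cpu_info *ci)
{
	struct example_xcall ex = { 0, COND_INITIALIZER() };
	struct task t = TASK_INITIALIZER(example_xcall, &ex);

	cpu_xcall(ci, &t);	/* a cross call to the local cpu runs immediately */
	cond_wait(&ex.ex_c, "extsc");

	return (ex.ex_tsc);
}

Note that the task and the cond live on the caller's stack, which is why the caller must block in cond_wait() until the remote CPU has called cond_signal(); the kstat read functions and vmx_remote_vmclear() in the diff follow the same pattern.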