? usr.sbin/lltrace/lltrace_20220509_164453_bgp-backend.embarrassm.net.trace ? usr.sbin/lltrace/load ? usr.sbin/lltrace/run ? usr.sbin/lltrace/run0 ? usr.sbin/lltrace/runj5 Index: sys/arch/amd64/amd64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/conf.c,v retrieving revision 1.74 diff -u -p -r1.74 conf.c --- sys/arch/amd64/amd64/conf.c 11 Nov 2021 10:03:08 -0000 1.74 +++ sys/arch/amd64/amd64/conf.c 10 May 2022 05:17:51 -0000 @@ -136,6 +136,7 @@ cdev_decl(cy); #include "bktr.h" #include "ksyms.h" #include "kstat.h" +#include "llt.h" #include "usb.h" #include "uhid.h" #include "fido.h" @@ -212,7 +213,8 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 28 was LKM */ cdev_notdef(), /* 29 */ cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ - cdev_notdef(), /* 31 */ + cdev_lltrace_init(NLLT,lltrace), + /* 31: lltrace */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ cdev_notdef(), /* 34 */ Index: sys/arch/amd64/amd64/intr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/intr.c,v retrieving revision 1.55 diff -u -p -r1.55 intr.c --- sys/arch/amd64/amd64/intr.c 28 Dec 2020 14:23:30 -0000 1.55 +++ sys/arch/amd64/amd64/intr.c 10 May 2022 05:17:51 -0000 @@ -38,6 +38,8 @@ /* #define INTRDEBUG */ +#include "llt.h" + #include #include #include @@ -60,6 +62,11 @@ #include #endif +#if NLLT > 0 +#include +#include +#endif + struct pic softintr_pic = { {0, {NULL}, NULL, 0, "softintr_pic0", NULL, 0, 0}, PIC_SOFT, @@ -520,6 +527,9 @@ intr_handler(struct intrframe *frame, st struct cpu_info *ci = curcpu(); int floor; int rc; +#if NLLT > 0 + struct lltrace_cpu *llt; +#endif #ifdef MULTIPROCESSOR int need_lock; @@ -531,10 +541,27 @@ intr_handler(struct intrframe *frame, st if (need_lock) __mp_lock(&kernel_lock); #endif + +#if NLLT > 0 + llt = lltrace_enter_cpu(ci); /* intr can't sleep, so hold the ref */ +#endif + floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ih->ih_level; +#if NLLT > 0 + if (llt != NULL) + lltrace_irq(llt, 0, ih->ih_slot); +#endif rc = (*ih->ih_fun)(ih->ih_arg ? ih->ih_arg : frame); +#if NLLT > 0 + if (llt != NULL) + lltrace_irqret(llt, 0, ih->ih_slot); +#endif ci->ci_handled_intr_level = floor; + +#if NLLT > 0 + lltrace_leave(llt); +#endif #ifdef MULTIPROCESSOR if (need_lock) __mp_unlock(&kernel_lock); Index: sys/arch/amd64/amd64/ipi.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/ipi.c,v retrieving revision 1.17 diff -u -p -r1.17 ipi.c --- sys/arch/amd64/amd64/ipi.c 21 Jan 2020 02:01:50 -0000 1.17 +++ sys/arch/amd64/amd64/ipi.c 10 May 2022 05:17:51 -0000 @@ -32,19 +32,32 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "llt.h" + #include #include #include +#include #include -#include #include #include #include +#if NLLT > 0 +#include +#endif + void x86_send_ipi(struct cpu_info *ci, int ipimask) { +#if NLLT > 0 + struct lltrace_cpu *llt = lltrace_enter(); + if (llt != NULL) + lltrace_ipi(llt, ci->ci_cpuid); + lltrace_leave(llt); +#endif + x86_atomic_setbits_u32(&ci->ci_ipis, ipimask); /* Don't send IPI to cpu which isn't (yet) running. 
*/ @@ -57,6 +70,13 @@ x86_send_ipi(struct cpu_info *ci, int ip int x86_fast_ipi(struct cpu_info *ci, int ipi) { +#if 0 && NLLT > 0 + struct lltrace_cpu *llt = lltrace_enter(); + if (llt != NULL) + lltrace_ipi(llt, ci->ci_cpuid); + lltrace_leave(llt); +#endif + if (!(ci->ci_flags & CPUF_RUNNING)) return (ENOENT); @@ -72,6 +92,13 @@ x86_broadcast_ipi(int ipimask) int count = 0; CPU_INFO_ITERATOR cii; +#if NLLT > 0 + struct lltrace_cpu *llt = lltrace_enter_cpu(self); + if (llt != NULL) + lltrace_ipi_bcast(llt); + lltrace_leave(llt); +#endif + CPU_INFO_FOREACH(cii, ci) { if (ci == self) continue; @@ -94,18 +121,37 @@ x86_ipi_handler(void) u_int32_t pending; int bit; int floor; +#if NLLT > 0 + struct lltrace_cpu *llt; +#endif floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; +#if NLLT > 0 + llt = lltrace_enter_cpu(ci); + /* this can't sleep, so we can hold the ref */ +#endif + pending = atomic_swap_uint(&ci->ci_ipis, 0); for (bit = 0; bit < X86_NIPI && pending; bit++) { if (pending & (1 << bit)) { pending &= ~(1 << bit); +#if NLLT > 0 + if (llt != NULL) + lltrace_irq(llt, LLTRACE_IRQ_IPI, bit); +#endif (*ipifunc[bit])(ci); +#if NLLT > 0 + if (llt != NULL) + lltrace_irqret(llt, LLTRACE_IRQ_IPI, bit); +#endif ipi_count.ec_count++; } } +#if NLLT > 0 + lltrace_leave(llt); +#endif ci->ci_handled_intr_level = floor; } Index: sys/arch/amd64/amd64/softintr.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/softintr.c,v retrieving revision 1.10 diff -u -p -r1.10 softintr.c --- sys/arch/amd64/amd64/softintr.c 11 Sep 2020 09:27:09 -0000 1.10 +++ sys/arch/amd64/amd64/softintr.c 10 May 2022 05:17:51 -0000 @@ -34,6 +34,8 @@ * Generic soft interrupt implementation for NetBSD/x86. */ +#include "llt.h" + #include #include #include @@ -42,6 +44,11 @@ #include +#if NLLT > 0 +#include +#include +#endif + struct x86_soft_intr x86_soft_intrs[X86_NSOFTINTR]; const int x86_soft_intr_to_ssir[X86_NSOFTINTR] = { @@ -81,6 +88,9 @@ softintr_dispatch(int which) struct x86_soft_intr *si = &x86_soft_intrs[which]; struct x86_soft_intrhand *sih; int floor; +#if NLLT > 0 + struct lltrace_cpu *llt; +#endif floor = ci->ci_handled_intr_level; ci->ci_handled_intr_level = ci->ci_ilevel; @@ -99,8 +109,19 @@ softintr_dispatch(int which) uvmexp.softs++; mtx_leave(&si->softintr_lock); - +#if NLLT > 0 + llt = lltrace_enter_cpu(ci); + if (llt != NULL) + lltrace_irq(llt, LLTRACE_IRQ_BOTTOM_HALF, which); + lltrace_leave(llt); +#endif (*sih->sih_fn)(sih->sih_arg); +#if NLLT > 0 + llt = lltrace_enter_cpu(ci); + if (llt != NULL) + lltrace_irqret(llt, LLTRACE_IRQ_BOTTOM_HALF, which); + lltrace_leave(llt); +#endif } KERNEL_UNLOCK(); Index: sys/arch/amd64/include/cpu.h =================================================================== RCS file: /cvs/src/sys/arch/amd64/include/cpu.h,v retrieving revision 1.142 diff -u -p -r1.142 cpu.h --- sys/arch/amd64/include/cpu.h 26 Apr 2022 08:35:30 -0000 1.142 +++ sys/arch/amd64/include/cpu.h 10 May 2022 05:17:51 -0000 @@ -108,7 +108,6 @@ struct cpu_info { struct schedstate_percpu ci_schedstate; /* scheduler state */ struct cpu_info *ci_next; - struct proc *ci_curproc; u_int ci_cpuid; u_int ci_apicid; u_int ci_acpi_proc_id; @@ -119,9 +118,10 @@ struct cpu_info { u_int64_t ci_user_cr3; /* U-K page table */ /* bits for mitigating Micro-architectural Data Sampling */ - char ci_mds_tmp[32]; /* 32byte aligned */ + char ci_mds_tmp[32]; /* 32byte aligned */ void *ci_mds_buf; + struct proc *ci_curproc; struct pmap *ci_proc_pmap; 
/* last userspace pmap */ struct pcb *ci_curpcb; struct pcb *ci_idle_pcb; Index: sys/conf/GENERIC =================================================================== RCS file: /cvs/src/sys/conf/GENERIC,v retrieving revision 1.284 diff -u -p -r1.284 GENERIC --- sys/conf/GENERIC 19 Apr 2022 01:32:06 -0000 1.284 +++ sys/conf/GENERIC 10 May 2022 05:17:53 -0000 @@ -82,6 +82,7 @@ pseudo-device endrun 1 # EndRun line dis pseudo-device vnd 4 # vnode disk devices pseudo-device ksyms 1 # kernel symbols device pseudo-device kstat # kernel statistics device +pseudo-device llt # low-level tracing device # clonable devices pseudo-device bpfilter # packet filter Index: sys/conf/files =================================================================== RCS file: /cvs/src/sys/conf/files,v retrieving revision 1.714 diff -u -p -r1.714 files --- sys/conf/files 19 Mar 2022 10:25:09 -0000 1.714 +++ sys/conf/files 10 May 2022 05:17:53 -0000 @@ -573,6 +573,9 @@ file dev/ksyms.c ksyms needs-flag pseudo-device kstat file dev/kstat.c kstat needs-flag +pseudo-device llt +file dev/lltrace.c llt needs-flag + pseudo-device fuse file miscfs/fuse/fuse_device.c fuse needs-flag file miscfs/fuse/fuse_file.c fuse Index: sys/dev/lltrace.c =================================================================== RCS file: sys/dev/lltrace.c diff -N sys/dev/lltrace.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/lltrace.c 10 May 2022 05:17:53 -0000 @@ -0,0 +1,876 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 The University of Queensland + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * This code was written by David Gwynne as part + * of the Information Technology Infrastructure Group (ITIG) in the + * Faculty of Engineering, Architecture and Information Technology + * (EAIT). + * + * It was heavily inspired by and aims to be largely compatible + * with the KUTrace (kernel/userland tracing) framework by Richard + * L. Sites. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#if defined(__amd64__) || defined(__i386__) + +static inline unsigned int +lltrace_cas(unsigned int *p, unsigned int e, unsigned int n) +{ + __asm volatile("cmpxchgl %2, %1" + : "=a" (e), "=m" (*p) + : "r" (n), "a" (e), "m" (*p)); + + return (e); +} + +static inline uint64_t +lltrace_ts(void) +{ + unsigned int hi, lo; + + __asm volatile("lfence; rdtsc" : "=d" (hi), "=a" (lo)); + + return (lo >> 6); +} + +static inline uint64_t +lltrace_ts_long(void) +{ + return (rdtsc_lfence() >> 6); +} + +#else /* not x86 */ + +static unsigned int +lltrace_cas(unsigned int *p, unsigned int e, unsigned int n) +{ + unsigned int o; + int s; + + s = intr_disable(); + o = *p; + if (o == e) + *p = n; + intr_restore(s); + + return (o); +} + +static inline uint64_t +lltrace_ts(void) +{ + return (countertime()); +} + +static inline uint64_t +lltrace_ts_long(void) +{ + return (countertime()); +} + +#endif + +#define LLTRACE_MB2NBUF(_mb) \ + (((_mb) * (1U << 20)) / sizeof(struct lltrace_buffer)) +#define LLTRACE_NBUF2MB(_nbuf) \ + (((_nbuf) * sizeof(struct lltrace_buffer)) / (1U << 20)) + +#define LLTRACE_BLEN_DEFAULT 16 + +struct lltrace_cpu { + SIMPLEQ_ENTRY(lltrace_cpu) + llt_entry; + struct lltrace_buffer llt_buffer; + unsigned int llt_slot; + pid_t llt_tid; +}; + +SIMPLEQ_HEAD(lltrace_cpu_list, lltrace_cpu); + +struct lltrace_softc { + unsigned int sc_running; + unsigned int sc_mode; + struct rwlock sc_lock; + unsigned int sc_nbuffers; + + unsigned int sc_free; + unsigned int sc_used; + struct lltrace_cpu **sc_ring; + struct lltrace_cpu *sc_buffers; + + unsigned int sc_read; + unsigned int sc_reading; + struct selinfo sc_sel; + + uint64_t sc_boottime; + uint64_t sc_monotime; +}; + +static int lltrace_start(struct lltrace_softc *, struct proc *); +static int lltrace_stop(struct lltrace_softc *, struct proc *); +static int lltrace_flush(struct lltrace_softc *); + +static struct lltrace_softc *lltrace_sc; + +static void lltrace_arg32(struct lltrace_cpu *, uint64_t, unsigned int); + +int +lltattach(int num) +{ + return (0); +} + +int +lltraceopen(dev_t dev, int flag, int mode, struct proc *p) +{ + struct lltrace_softc *sc; + int error; + + if (minor(dev) != 0) + return (ENXIO); + + error = suser(p); + if (error != 0) + return (error); + + if (lltrace_sc != NULL) + return (EBUSY); + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (sc == NULL) + return (ENOMEM); + + sc->sc_running = 0; + sc->sc_nbuffers = LLTRACE_MB2NBUF(LLTRACE_BLEN_DEFAULT); + printf("%s[%u]: nbuffers %u\n", __func__, __LINE__, sc->sc_nbuffers); + /* XXX */ + + rw_init(&sc->sc_lock, "lltlk"); + + sc->sc_read = 0; + sc->sc_reading = 0; + klist_init_rwlock(&sc->sc_sel.si_note, &sc->sc_lock); + + /* commit */ + if (atomic_cas_ptr(&lltrace_sc, NULL, sc) != NULL) { + free(sc, M_DEVBUF, sizeof(*sc)); + return (EBUSY); + } + + return (0); +} + +int +lltraceclose(dev_t dev, int flag, int mode, struct proc *p) +{ + struct lltrace_softc *sc = lltrace_sc; + + rw_enter_write(&sc->sc_lock); + lltrace_stop(sc, p); + lltrace_flush(sc); + rw_exit_write(&sc->sc_lock); + + lltrace_sc = NULL; + membar_sync(); + + free(sc, M_DEVBUF, sizeof(*sc)); + + return (0); +} + +static int +lltrace_fionread(struct lltrace_softc *sc) +{ + int canread; + + rw_enter_read(&sc->sc_lock); + canread = !sc->sc_running && (sc->sc_buffers != NULL) && + (sc->sc_read < sc->sc_nbuffers); + rw_exit_read(&sc->sc_lock); + + return (canread ? 
sizeof(struct lltrace_buffer) : 0);
+}
+
+static void
+lltrace_cpu_init(struct lltrace_cpu *llt, struct lltrace_softc *sc,
+    struct cpu_info *ci, pid_t tid)
+{
+	struct lltrace_header *llh;
+
+	llh = (struct lltrace_header *)&llt->llt_buffer;
+	llh->h_cpu = cpu_number();
+	llh->h_boottime = sc->sc_boottime;
+	llh->h_start_cy = lltrace_ts_long();
+	llh->h_start_ns = nsecuptime() - sc->sc_monotime;
+	llh->h_end_cy = 0;
+	llh->h_end_ns = 0;
+	llh->h_idletid = ci->ci_schedstate.spc_idleproc->p_tid;
+	llh->h_tid = tid;
+	llh->h_zero = 0;
+
+	llt->llt_tid = tid;
+	llt->llt_slot = 8;
+}
+
+static void
+lltrace_cpu_fini(struct lltrace_cpu *llt, struct lltrace_softc *sc)
+{
+	struct lltrace_header *llh;
+
+	llh = (struct lltrace_header *)&llt->llt_buffer;
+	llh->h_end_cy = lltrace_ts_long();
+	llh->h_end_ns = nsecuptime() - sc->sc_monotime;
+}
+
+static int
+lltrace_set_mode(struct lltrace_softc *sc, unsigned int mode)
+{
+	int error;
+
+	if (mode >= LLTRACE_MODE_COUNT)
+		return (EINVAL);
+
+	error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR);
+	if (error != 0)
+		return (error);
+
+	if (sc->sc_running)
+		error = EBUSY;
+	else
+		sc->sc_mode = mode;
+
+	rw_exit(&sc->sc_lock);
+	return (error);
+}
+
+static int
+lltrace_set_blen(struct lltrace_softc *sc, unsigned int blen)
+{
+	int error;
+	unsigned int nbuffers;
+
+	if (blen < LLTRACE_BLEN_MIN || blen > LLTRACE_BLEN_MAX)
+		return (EINVAL);
+
+	/* convert megabytes to the number of buffers */
+	nbuffers = LLTRACE_MB2NBUF(blen);
+	if (nbuffers <= ncpus)
+		return (EINVAL);
+
+	error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR);
+	if (error != 0)
+		return (error);
+
+	if (sc->sc_buffers != NULL)
+		error = EBUSY;
+	else {
+		sc->sc_nbuffers = nbuffers;
+		printf("%s[%u]: nbuffers %u\n", __func__, __LINE__,
+		    sc->sc_nbuffers);
+	}
+
+	rw_exit(&sc->sc_lock);
+	return (error);
+}
+
+static int
+lltrace_start(struct lltrace_softc *sc, struct proc *p)
+{
+	struct bintime boottime;
+	unsigned int i;
+	size_t sz;
+	struct lltrace_cpu_list l = SIMPLEQ_HEAD_INITIALIZER(l);
+	struct lltrace_cpu *llt;
+	struct cpu_info *ci;
+	CPU_INFO_ITERATOR cii;
+
+	if (sc->sc_running)
+		return (EINVAL);
+
+	if (sc->sc_nbuffers <= (ncpus * 2 + 1))
+		return (EINVAL);
+
+	lltrace_flush(sc);
+
+	sc->sc_monotime = nsecuptime();
+
+	binboottime(&boottime);
+	sc->sc_boottime = BINTIME_TO_NSEC(&boottime) + sc->sc_monotime;
+
+	sz = roundup(sc->sc_nbuffers * sizeof(*sc->sc_buffers), PAGE_SIZE);
+	sc->sc_buffers = km_alloc(sz, &kv_any, &kp_dirty, &kd_waitok);
+	if (sc->sc_buffers == NULL)
+		return (ENOMEM);
+	sc->sc_ring = mallocarray(sc->sc_nbuffers, sizeof(*sc->sc_ring),
+	    M_DEVBUF, M_WAITOK);
+	for (i = 0; i < sc->sc_nbuffers; i++) {
+		llt = &sc->sc_buffers[i];
+		llt->llt_slot = 0;
+		sc->sc_ring[i] = llt;
+	}
+
+	sc->sc_free = 0;	/* next slot to pull a free buffer from */
+	sc->sc_used = 0;	/* next slot to put a used buffer in */
+
+	CPU_INFO_FOREACH(cii, ci) {
+		i = sc->sc_free++;	/* can't wrap yet */
+
+		llt = sc->sc_ring[i];
+		sc->sc_ring[i] = NULL;
+
+		SIMPLEQ_INSERT_HEAD(&l, llt, llt_entry);
+	}
+
+	CPU_INFO_FOREACH(cii, ci) {
+		sched_peg_curproc(ci);
+
+		llt = SIMPLEQ_FIRST(&l);
+		SIMPLEQ_REMOVE_HEAD(&l, llt_entry);
+
+		lltrace_cpu_init(llt, sc, ci, p->p_tid);
+		lltrace_pidname(llt, p);
+
+		membar_producer();
+		ci->ci_schedstate.spc_lltrace = llt;
+	}
+	atomic_clearbits_int(&p->p_flag, P_CPUPEG);
+
+	sc->sc_running = 1;
+
+	return (0);
+}
+
+static int
+lltrace_stop(struct lltrace_softc *sc, struct proc *p)
+{
+	struct lltrace_cpu *llt;
+	struct cpu_info *ci;
+	CPU_INFO_ITERATOR cii;
+	unsigned long
s; + + if (!sc->sc_running) + return (EALREADY); + + sc->sc_running = 0; + + /* visit each cpu to take llt away safely */ + CPU_INFO_FOREACH(cii, ci) { + sched_peg_curproc(ci); + + s = intr_disable(); + llt = ci->ci_schedstate.spc_lltrace; + ci->ci_schedstate.spc_lltrace = NULL; + intr_restore(s); + + lltrace_cpu_fini(llt, sc); + } + atomic_clearbits_int(&p->p_flag, P_CPUPEG); + + return (0); +} + +static int +lltrace_flush(struct lltrace_softc *sc) +{ + size_t sz; + + rw_assert_wrlock(&sc->sc_lock); + if (sc->sc_running) + return (EBUSY); + + if (sc->sc_buffers == NULL) + return (0); + + sz = roundup(sc->sc_nbuffers * sizeof(*sc->sc_buffers), PAGE_SIZE); + km_free(sc->sc_buffers, sz, &kv_any, &kp_dirty); + free(sc->sc_ring, M_DEVBUF, sc->sc_nbuffers * sizeof(*sc->sc_ring)); + + sc->sc_buffers = NULL; + sc->sc_ring = NULL; + sc->sc_read = 0; + + return (0); +} + +int +lltraceioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) +{ + struct lltrace_softc *sc = lltrace_sc; + int error = 0; + + KERNEL_UNLOCK(); + + switch (cmd) { + case FIONREAD: + *(int *)data = lltrace_fionread(sc); + break; + case FIONBIO: + /* vfs tracks this for us if we let it */ + break; + + case LLTIOCSTART: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_start(sc, p); + rw_exit(&sc->sc_lock); + break; + case LLTIOCSTOP: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_stop(sc, p); + rw_exit(&sc->sc_lock); + break; + case LLTIOCFLUSH: + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + break; + error = lltrace_flush(sc); + rw_exit(&sc->sc_lock); + break; + + case LLTIOCSBLEN: + error = lltrace_set_blen(sc, *(unsigned int *)data); + break; + case LLTIOCGBLEN: + *(unsigned int *)data = LLTRACE_NBUF2MB(sc->sc_nbuffers); + break; + + case LLTIOCSMODE: + error = lltrace_set_mode(sc, *(unsigned int *)data); + break; + case LLTIOCGMODE: + *(unsigned int *)data = sc->sc_mode; + break; + + default: + error = ENOTTY; + break; + } + + KERNEL_LOCK(); + + return (error); +} + +int +lltraceread(dev_t dev, struct uio *uio, int ioflag) +{ + struct lltrace_softc *sc = lltrace_sc; + struct lltrace_cpu *llt; + unsigned int slot; + int error; + + KERNEL_UNLOCK(); + + error = rw_enter(&sc->sc_lock, RW_WRITE|RW_INTR); + if (error != 0) + goto lock; + + if (sc->sc_running) { + if (ISSET(ioflag, IO_NDELAY)) { + error = EWOULDBLOCK; + goto unlock; + } + + do { + sc->sc_reading++; + error = rwsleep_nsec(&sc->sc_reading, &sc->sc_lock, + PRIBIO|PCATCH, "lltread", INFSLP); + sc->sc_reading--; + if (error != 0) + goto unlock; + } while (sc->sc_running); + } + + if (sc->sc_buffers == NULL) { + error = 0; + goto unlock; + } + + slot = sc->sc_read; + for (;;) { + if (slot >= sc->sc_nbuffers) { + error = 0; + goto unlock; + } + + llt = &sc->sc_buffers[slot]; + KASSERT(llt->llt_slot <= nitems(llt->llt_buffer.llt_slots)); + if (llt->llt_slot > 0) + break; + + slot++; + } + + error = uiomove(&llt->llt_buffer, + llt->llt_slot * sizeof(llt->llt_buffer.llt_slots[0]), uio); + if (error != 0) + goto unlock; + + sc->sc_read = slot + 1; + +unlock: + rw_exit(&sc->sc_lock); +lock: + KERNEL_LOCK(); + return (error); +} + +static void +lltrace_filt_detach(struct knote *kn) +{ + struct lltrace_softc *sc = kn->kn_hook; + + klist_remove(&sc->sc_sel.si_note, kn); +} + +static int +lltrace_filt_event(struct knote *kn, long hint) +{ + struct lltrace_softc *sc = kn->kn_hook; + int canread; + + canread = !sc->sc_running && (sc->sc_buffers != 
NULL) && + (sc->sc_read < sc->sc_nbuffers); + + kn->kn_data = canread ? sizeof(struct lltrace_buffer) : 0; + + return (canread); +} + +static int +lltrace_filt_modify(struct kevent *kev, struct knote *kn) +{ + struct lltrace_softc *sc = kn->kn_hook; + int active; + + rw_enter_write(&sc->sc_lock); + active = knote_modify_fn(kev, kn, lltrace_filt_event); + rw_exit_write(&sc->sc_lock); + + return (active); +} + +static int +lltrace_filt_process(struct knote *kn, struct kevent *kev) +{ + struct lltrace_softc *sc = kn->kn_hook; + int active; + + rw_enter_write(&sc->sc_lock); + active = knote_process_fn(kn, kev, lltrace_filt_event); + rw_exit_write(&sc->sc_lock); + + return (active); +} + +static const struct filterops lltrace_filtops = { + .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE, + .f_attach = NULL, + .f_detach = lltrace_filt_detach, + .f_event = lltrace_filt_event, + .f_modify = lltrace_filt_modify, + .f_process = lltrace_filt_process, +}; + +int +lltracekqfilter(dev_t dev, struct knote *kn) +{ + struct lltrace_softc *sc = lltrace_sc; + struct klist *klist; + + switch (kn->kn_filter) { + case EVFILT_READ: + klist = &sc->sc_sel.si_note; + kn->kn_fop = &lltrace_filtops; + break; + default: + return (EINVAL); + } + + kn->kn_hook = sc; + klist_insert(klist, kn); + + return (0); +} + +static struct lltrace_cpu * +lltrace_next(struct lltrace_cpu *llt) +{ + struct lltrace_softc *sc = lltrace_sc; + struct cpu_info *ci = curcpu(); + struct lltrace_cpu *nllt; + unsigned int slot, oslot, nslot; + + /* check if we were preempted */ + nllt = ci->ci_schedstate.spc_lltrace; + if (nllt != llt) { + /* something preempted us and swapped buffers already */ + return (nllt); + } + + slot = sc->sc_free; + for (;;) { + nslot = slot + 1; + if (nslot > sc->sc_nbuffers) { + if (sc->sc_mode == LLTRACE_MODE_HEAD) + return (NULL); + } + + oslot = atomic_cas_uint(&sc->sc_free, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + slot %= sc->sc_nbuffers; + nllt = sc->sc_ring[slot]; + sc->sc_ring[slot] = NULL; + + slot = sc->sc_used; + for (;;) { + nslot = slot + 1; + + oslot = atomic_cas_uint(&sc->sc_used, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + lltrace_cpu_init(nllt, sc, ci, llt->llt_tid); + lltrace_cpu_fini(llt, sc); + + slot %= sc->sc_nbuffers; + sc->sc_ring[slot] = llt; + + ci->ci_schedstate.spc_lltrace = nllt; + + return (nllt); +} + +static unsigned int +lltrace_insert(struct lltrace_cpu *llt, uint64_t record, + const uint64_t *extra, unsigned int n) +{ + unsigned int slot, oslot, nslot; + uint64_t *slots; + + n++; + record |= lltrace_ts() << LLTRACE_TIMESTAMP_SHIFT; + + slot = llt->llt_slot; + for (;;) { + nslot = slot + n; + if (nslot > nitems(llt->llt_buffer.llt_slots)) { + unsigned long s; + + s = intr_disable(); + llt = lltrace_next(llt); + intr_restore(s); + + if (llt == NULL) + return (1); + + slot = llt->llt_slot; + continue; + } + + oslot = lltrace_cas(&llt->llt_slot, slot, nslot); + if (slot == oslot) + break; + + slot = oslot; + } + + slots = llt->llt_buffer.llt_slots + slot; + *slots = record; + while (n > 1) { + *(++slots) = *(extra++); + n--; + } + + return (0); +} + +void +lltrace_statclock(struct lltrace_cpu *llt, int usermode, unsigned long pc) +{ + uint64_t event = usermode ? 
LLTRACE_EVENT_PC_U : LLTRACE_EVENT_PC_K; + uint64_t extra[1] = { pc }; + + lltrace_insert(llt, (event | nitems(extra)) << LLTRACE_EVENT_SHIFT, + extra, nitems(extra)); +} + +void +lltrace_syscall(struct lltrace_cpu *llt, register_t code, + size_t argsize, const register_t *args) +{ + uint64_t record = LLTRACE_EVENT_SYSCALL(code) << LLTRACE_EVENT_SHIFT; + + if (argsize > 0) { + uint64_t arg0 = args[0] & LLTRACE_ARG0_MASK; + record |= arg0 << LLTRACE_ARG0_SHIFT; + } + + lltrace_insert(llt, record, NULL, 0); +} + +void +lltrace_sysret(struct lltrace_cpu *llt, register_t code, + int error, const register_t retvals[2]) +{ + uint64_t record = LLTRACE_EVENT_SYSRET(code) << LLTRACE_EVENT_SHIFT; + uint64_t arg0 = error & LLTRACE_ARG0_MASK; + record |= arg0 << LLTRACE_ARG0_SHIFT; + unsigned int stop; + + stop = lltrace_insert(llt, record, NULL, 0); + + if (stop) { + struct lltrace_softc *sc = lltrace_sc; + + rw_enter_write(&sc->sc_lock); + if (sc->sc_running) + lltrace_stop(sc, curproc); + + KNOTE(&sc->sc_sel.si_note, 0); + if (sc->sc_reading) + wakeup(&sc->sc_reading); + rw_exit_write(&sc->sc_lock); + } +} + +void +lltrace_pidname(struct lltrace_cpu *llt, struct proc *p) +{ + uint64_t record; + uint64_t extra[3]; + unsigned int l, n; + + CTASSERT(sizeof(extra) == sizeof(p->p_p->ps_comm)); + + extra[0] = extra[1] = extra[2] = 0; /* memset */ + l = strlcpy((char *)extra, p->p_p->ps_comm, sizeof(extra)); + + /* turn the string length into the number of slots we need */ + n = howmany(l, sizeof(uint64_t)); + + record = (LLTRACE_EVENT_PID | n) << LLTRACE_EVENT_SHIFT; + record |= (p->p_tid & LLTRACE_ARG32_MASK) << LLTRACE_ARG32_SHIFT; + + llt->llt_tid = p->p_tid; + + lltrace_insert(llt, record, extra, n); +} + +void +lltrace_sched_enter(struct lltrace_cpu *llt) +{ + uint64_t record = LLTRACE_EVENT_SCHED << LLTRACE_EVENT_SHIFT; + + lltrace_insert(llt, record, NULL, 0); +} + +void +lltrace_sched_leave(struct lltrace_cpu *llt) +{ + uint64_t record = LLTRACE_EVENT_SCHEDRET << LLTRACE_EVENT_SHIFT; + + lltrace_insert(llt, record, NULL, 0); +} + +void +lltrace_idle(struct lltrace_cpu *llt) +{ + uint64_t record = LLTRACE_EVENT_IDLE << LLTRACE_EVENT_SHIFT; + + lltrace_insert(llt, record, NULL, 0); +} + +static void +lltrace_arg32(struct lltrace_cpu *llt, uint64_t event, unsigned int arg32) +{ + uint64_t record; + + record = event << LLTRACE_EVENT_SHIFT; + record |= (arg32 & LLTRACE_ARG32_MASK) << LLTRACE_ARG32_SHIFT; + + lltrace_insert(llt, record, NULL, 0); +} + +void +lltrace_runnable(struct lltrace_cpu *llt, struct proc *p) +{ + lltrace_arg32(llt, LLTRACE_EVENT_RUNNABLE, p->p_tid); +} + +void +lltrace_trap(struct lltrace_cpu *llt, unsigned int trap) +{ + lltrace_arg32(llt, LLTRACE_EVENT_TRAP, trap); +} + +void +lltrace_trapret(struct lltrace_cpu *llt, unsigned int trap) +{ + lltrace_arg32(llt, LLTRACE_EVENT_TRAPRET, trap); +} + +void +lltrace_ipi(struct lltrace_cpu *llt, unsigned int cpu) +{ + lltrace_arg32(llt, LLTRACE_EVENT_IPI, cpu); +} + +void +lltrace_irq(struct lltrace_cpu *llt, unsigned int type, unsigned int vec) +{ + lltrace_arg32(llt, LLTRACE_EVENT_IRQ(type), vec); +} + +void +lltrace_irqret(struct lltrace_cpu *llt, unsigned int type, unsigned int vec) +{ + lltrace_arg32(llt, LLTRACE_EVENT_IRQRET(type), vec); +} + +void +lltrace_lock(struct lltrace_cpu *llt, void *lock, unsigned int op) +{ + lltrace_arg32(llt, LLTRACE_EVENT_LOCK(op), (uint32_t)(intptr_t)lock); +} Index: sys/kern/kern_clock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clock.c,v 
retrieving revision 1.103 diff -u -p -r1.103 kern_clock.c --- sys/kern/kern_clock.c 16 Feb 2022 08:01:32 -0000 1.103 +++ sys/kern/kern_clock.c 10 May 2022 05:17:56 -0000 @@ -60,6 +60,11 @@ #include #endif +#include "llt.h" +#if NLLT > 0 +#include +#endif + /* * Clock handling routines. * @@ -327,6 +332,17 @@ statclock(struct clockframe *frame) setstatclockrate(profhz); } } + +#if NLLT > 0 + { + struct lltrace_cpu *llt = lltrace_enter_spc(spc); + if (llt != NULL) { + lltrace_statclock(llt, + CLKF_USERMODE(frame), CLKF_PC(frame)); + } + lltrace_leave(llt); + } +#endif if (CLKF_USERMODE(frame)) { pr = p->p_p; Index: sys/kern/kern_exec.c =================================================================== RCS file: /cvs/src/sys/kern/kern_exec.c,v retrieving revision 1.230 diff -u -p -r1.230 kern_exec.c --- sys/kern/kern_exec.c 22 Feb 2022 17:14:14 -0000 1.230 +++ sys/kern/kern_exec.c 10 May 2022 05:17:56 -0000 @@ -33,6 +33,8 @@ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include "llt.h" + #include #include #include @@ -67,6 +69,10 @@ #include +#if NLLT > 0 +#include +#endif + struct uvm_object *sigobject; /* shared sigcode object */ struct uvm_object *timekeep_object; struct timekeep *timekeep; @@ -510,6 +516,15 @@ sys_execve(struct proc *p, void *v, regi memset(pr->ps_comm, 0, sizeof(pr->ps_comm)); strlcpy(pr->ps_comm, nid.ni_cnd.cn_nameptr, sizeof(pr->ps_comm)); pr->ps_acflag &= ~AFORK; + +#if NLLT > 0 + { + struct lltrace_cpu *llt = lltrace_enter(); + if (llt != NULL) + lltrace_pidname(llt, p); + lltrace_leave(llt); + } +#endif /* record proc's vnode, for use by sysctl */ otvp = pr->ps_textvp; Index: sys/kern/kern_lock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_lock.c,v retrieving revision 1.72 diff -u -p -r1.72 kern_lock.c --- sys/kern/kern_lock.c 26 Apr 2022 15:31:14 -0000 1.72 +++ sys/kern/kern_lock.c 10 May 2022 05:17:56 -0000 @@ -18,6 +18,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include "llt.h" + #include #include #include @@ -41,6 +43,10 @@ int __mp_lock_spinout = INT_MAX; #include struct __mp_lock kernel_lock; +#if NLLT > 0 +#include +#endif + /* * Functions for manipulating the kernel_lock. We put them here * so that they show up in profiles. 
@@ -132,6 +138,14 @@ __mp_lock(struct __mp_lock *mpl) { struct __mp_lock_cpu *cpu = &mpl->mpl_cpus[cpu_number()]; unsigned long s; +#if NLLT > 0 + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, mpl, LLTRACE_LOCK_NOACQUIRE); + lltrace_leave(llt); +#endif #ifdef WITNESS if (!__mp_lock_held(mpl, curcpu())) @@ -139,6 +153,7 @@ __mp_lock(struct __mp_lock *mpl) LOP_EXCLUSIVE | LOP_NEWORDER, NULL); #endif + s = intr_disable(); if (cpu->mplc_depth++ == 0) cpu->mplc_ticket = atomic_inc_int_nv(&mpl->mpl_users); @@ -148,6 +163,13 @@ __mp_lock(struct __mp_lock *mpl) membar_enter_after_atomic(); WITNESS_LOCK(&mpl->mpl_lock_obj, LOP_EXCLUSIVE); + +#if NLLT > 0 + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, mpl, LLTRACE_LOCK_ACQUIRE); + lltrace_leave(llt); +#endif } void @@ -167,6 +189,12 @@ __mp_unlock(struct __mp_lock *mpl) s = intr_disable(); if (--cpu->mplc_depth == 0) { +#if NLLT > 0 + struct lltrace_cpu *llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, mpl, LLTRACE_LOCK_WAKEUP); + lltrace_leave(llt); +#endif membar_exit(); mpl->mpl_ticket++; } @@ -183,6 +211,15 @@ __mp_release_all(struct __mp_lock *mpl) int i; #endif +#if NLLT > 0 + struct lltrace_cpu *llt = lltrace_enter(); + if (llt != NULL) { + if (mpl->mpl_ticket != mpl->mpl_users) + lltrace_lock(llt, mpl, LLTRACE_LOCK_WAKEUP); + } + lltrace_leave(llt); +#endif + s = intr_disable(); rv = cpu->mplc_depth; #ifdef WITNESS @@ -443,3 +480,85 @@ _mtx_init_flags(struct mutex *m, int ipl _mtx_init(m, ipl); } #endif /* WITNESS */ + +void +NET_LOCK(void) +{ +#if NLLT > 0 + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, &netlock, LLTRACE_LOCK_NOACQUIRE); + lltrace_leave(llt); +#endif + rw_enter_write(&netlock); +#if NLLT > 0 + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, &netlock, LLTRACE_LOCK_ACQUIRE); + lltrace_leave(llt); +#endif +} + +void +NET_UNLOCK(void) +{ +#if NLLT > 0 + struct lltrace_cpu *llt; +#endif + rw_exit_write(&netlock); +#if NLLT > 0 + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, &netlock, LLTRACE_LOCK_WAKEUP); + lltrace_leave(llt); +#endif +} + +void +NET_RLOCK_IN_SOFTNET(void) +{ +#if NLLT > 0 + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, &netlock, LLTRACE_LOCK_NOACQUIRE); + lltrace_leave(llt); +#endif + rw_enter_write(&netlock); +#if NLLT > 0 + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, &netlock, LLTRACE_LOCK_ACQUIRE); + lltrace_leave(llt); +#endif +} + +void +NET_RUNLOCK_IN_SOFTNET(void) +{ +#if NLLT > 0 + struct lltrace_cpu *llt; +#endif + rw_exit_write(&netlock); +#if NLLT > 0 + llt = lltrace_enter(); + if (llt != NULL) + lltrace_lock(llt, &netlock, LLTRACE_LOCK_WAKEUP); + lltrace_leave(llt); +#endif +} + +void +NET_RLOCK_IN_IOCTL(void) +{ + rw_enter_read(&netlock); +} + +void +NET_RUNLOCK_IN_IOCTL(void) +{ + rw_exit_read(&netlock); +} Index: sys/kern/kern_sched.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sched.c,v retrieving revision 1.74 diff -u -p -r1.74 kern_sched.c --- sys/kern/kern_sched.c 20 Jan 2022 11:06:57 -0000 1.74 +++ sys/kern/kern_sched.c 10 May 2022 05:17:56 -0000 @@ -15,6 +15,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ +#include "llt.h" + #include #include @@ -30,6 +32,10 @@ #include +#if NLLT > 0 +#include +#endif + void sched_kthreads_create(void *); int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p); @@ -185,6 +191,15 @@ sched_idle(void *v) spc->spc_whichqs ? 0 : SPCF_HALTED); SCHED_UNLOCK(s); wakeup(spc); + } +#endif +#if NLLT > 0 + { + struct lltrace_cpu *llt; + llt = lltrace_enter_spc(spc); + if (llt != NULL) + lltrace_idle(llt); + lltrace_leave(llt); } #endif cpu_idle_cycle(); Index: sys/kern/kern_synch.c =================================================================== RCS file: /cvs/src/sys/kern/kern_synch.c,v retrieving revision 1.186 diff -u -p -r1.186 kern_synch.c --- sys/kern/kern_synch.c 30 Apr 2022 14:44:04 -0000 1.186 +++ sys/kern/kern_synch.c 10 May 2022 05:17:56 -0000 @@ -37,6 +37,8 @@ * @(#)kern_synch.c 8.6 (Berkeley) 1/21/94 */ +#include "llt.h" + #include #include #include @@ -65,6 +67,10 @@ #include #endif +#if NLLT > 0 +#include +#endif + int sleep_signal_check(void); int thrsleep(struct proc *, struct sys___thrsleep_args *); int thrsleep_unlock(void *); @@ -544,6 +550,14 @@ unsleep(struct proc *p) p->p_wchan = NULL; TRACEPOINT(sched, wakeup, p->p_tid + THREAD_PID_OFFSET, p->p_p->ps_pid); +#if NLLT > 0 + { + struct lltrace_cpu *llt = lltrace_enter(); + if (llt != NULL) + lltrace_runnable(llt, p); + lltrace_leave(llt); + } +#endif } } Index: sys/kern/kern_tc.c =================================================================== RCS file: /cvs/src/sys/kern/kern_tc.c,v retrieving revision 1.75 diff -u -p -r1.75 kern_tc.c --- sys/kern/kern_tc.c 24 Oct 2021 00:02:25 -0000 1.75 +++ sys/kern/kern_tc.c 10 May 2022 05:17:56 -0000 @@ -140,6 +140,14 @@ tc_delta(struct timehands *th) tc->tc_counter_mask); } +unsigned int +countertime(void) +{ + struct timecounter *tc = timehands->th_counter; + + return (tc->tc_get_timecount(tc)); +} + /* * Functions for reading the time. We have to loop until we are sure that * the timehands that we operated on was not updated under our feet. 
See Index: sys/kern/sched_bsd.c =================================================================== RCS file: /cvs/src/sys/kern/sched_bsd.c,v retrieving revision 1.70 diff -u -p -r1.70 sched_bsd.c --- sys/kern/sched_bsd.c 30 Oct 2021 23:24:48 -0000 1.70 +++ sys/kern/sched_bsd.c 10 May 2022 05:17:56 -0000 @@ -54,7 +54,6 @@ #include #endif - int lbolt; /* once a second sleep address */ int rrticks_init; /* # of hardclock ticks per roundrobin() */ @@ -323,6 +322,8 @@ mi_switch(void) int sched_count; #endif + LLTRACE(lltrace_sched_enter); + assertwaitok(); KASSERT(p->p_stat != SONPROC); @@ -370,10 +371,13 @@ mi_switch(void) if (p != nextproc) { uvmexp.swtch++; + TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET, nextproc->p_p->ps_pid); cpu_switchto(p, nextproc); TRACEPOINT(sched, on__cpu, NULL); + + LLTRACE(lltrace_pidname, p); } else { TRACEPOINT(sched, remain__cpu, NULL); p->p_stat = SONPROC; @@ -394,6 +398,8 @@ mi_switch(void) #endif SCHED_ASSERT_UNLOCKED(); + + LLTRACE(lltrace_sched_leave); smr_idle(); Index: sys/sys/conf.h =================================================================== RCS file: /cvs/src/sys/sys/conf.h,v retrieving revision 1.156 diff -u -p -r1.156 conf.h --- sys/sys/conf.h 23 Jan 2021 05:08:36 -0000 1.156 +++ sys/sys/conf.h 10 May 2022 05:17:56 -0000 @@ -335,6 +335,13 @@ extern struct cdevsw cdevsw[]; (dev_type_stop((*))) enodev, 0, selfalse, \ (dev_type_mmap((*))) enodev } +/* open, close, read, ioctl, poll, kqfilter */ +#define cdev_lltrace_init(c,n) { \ + dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \ + (dev_type_write((*))) enodev, dev_init(c,n,ioctl), \ + (dev_type_stop((*))) enodev, 0, selfalse, \ + (dev_type_mmap((*))) enodev, 0, 0, dev_init(c,n,kqfilter) } + /* open, close, read, write, ioctl, stop, tty, poll, mmap, kqfilter */ #define cdev_wsdisplay_init(c,n) { \ dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \ @@ -620,6 +627,7 @@ cdev_decl(wsmux); cdev_decl(ksyms); cdev_decl(kstat); +cdev_decl(lltrace); cdev_decl(bio); cdev_decl(vscsi); Index: sys/sys/lltrace.h =================================================================== RCS file: sys/sys/lltrace.h diff -N sys/sys/lltrace.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/sys/lltrace.h 10 May 2022 05:17:56 -0000 @@ -0,0 +1,297 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _SYS_LLTRACE_H_ +#define _SYS_LLTRACE_H_ + +/* + * lltrace is heavily based KUTrace (kernel/userland tracing) by + * Richard L. Sites. + */ + +#define LLTRACE_NSLOTS 8192 + +struct lltrace_buffer { + uint64_t llt_slots[LLTRACE_NSLOTS]; +}; + +#define LLTIOCSTART _IO('t',128) +#define LLTIOCSTOP _IO('t',129) +#define LLTIOCFLUSH _IO('t',130) + +/* + * trace until all the buffers are used, or trace and reuse buffers. 
+ */
+#define LLTRACE_MODE_HEAD	0
+#define LLTRACE_MODE_TAIL	1
+#define LLTRACE_MODE_COUNT	2
+
+#define LLTIOCSMODE		_IOW('t', 131, unsigned int)
+#define LLTIOCGMODE		_IOR('t', 131, unsigned int)
+
+/*
+ * how much memory in MB to allocate for lltrace_buffer structs
+ * during tracing.
+ */
+
+#define LLTRACE_BLEN_MIN	1
+#define LLTRACE_BLEN_MAX	128
+
+#define LLTIOCSBLEN		_IOW('t', 132, unsigned int)
+#define LLTIOCGBLEN		_IOR('t', 132, unsigned int)
+
+/*
+ * lltrace collects kernel events in per-CPU buffers.
+ */
+
+/*
+ * The first 8 words of the per-CPU buffer are dedicated to metadata
+ * about the CPU and the period of time over which events were
+ * collected.
+ */
+
+struct lltrace_header {
+	/* slots[0] */
+	uint64_t	h_cpu;
+
+	/* slots[1] */
+	uint64_t	h_boottime;
+
+	/* slots[2] */
+	uint64_t	h_start_cy;
+	/* slots[3] */
+	uint64_t	h_start_ns;
+	/* slots[4] */
+	uint64_t	h_end_cy;
+	/* slots[5] */
+	uint64_t	h_end_ns;
+
+	/* slots[6] */
+	uint32_t	h_idletid;
+	uint32_t	h_tid;
+	/* slots[7] */
+	uint64_t	h_zero;
+};
+
+/*
+ * The high 32-bits of the trace entry contain a timestamp and event id.
+ */
+
+#define LLTRACE_TIMESTAMP_SHIFT	44
+#define LLTRACE_TIMESTAMP_BITS	20
+#define LLTRACE_TIMESTAMP_MASK	((1ULL << LLTRACE_TIMESTAMP_BITS) - 1)
+
+#define LLTRACE_EVENT_SHIFT	32
+#define LLTRACE_EVENT_BITS	12
+#define LLTRACE_EVENT_MASK	((1ULL << LLTRACE_EVENT_BITS) - 1)
+
+/*
+ * The low 32-bits vary depending on the event id.
+ */
+
+/* full 32 bits are used */
+#define LLTRACE_ARG32_SHIFT	0
+#define LLTRACE_ARG32_BITS	32
+#define LLTRACE_ARG32_MASK	((1ULL << LLTRACE_ARG32_BITS) - 1)
+
+/* layout for syscalls/traps/irqs */
+#define LLTRACE_ARG0_SHIFT	0
+#define LLTRACE_ARG0_BITS	16
+#define LLTRACE_ARG0_MASK	((1ULL << LLTRACE_ARG0_BITS) - 1)
+
+#define LLTRACE_RETVAL_SHIFT	16
+#define LLTRACE_RETVAL_BITS	8
+#define LLTRACE_RETVAL_MASK	((1ULL << LLTRACE_RETVAL_BITS) - 1)
+
+#define LLTRACE_DUR_SHIFT	24
+#define LLTRACE_DUR_BITS	8
+#define LLTRACE_DUR_MASK	((1ULL << LLTRACE_DUR_BITS) - 1)
+
+/*
+ * lltrace event types
+ */
+
+/*
+ * the high 3 bits of the event id define how the rest of the bits are used.
+ */
+
+#define LLTRACE_EVENT_T_MASK	(0x7ULL << 9)
+#define LLTRACE_EVENT_T_VARLEN	(0x0ULL << 9)
+#define LLTRACE_EVENT_T_MARK	(0x1ULL << 9)
+#define LLTRACE_EVENT_T_IRQ	(0x2ULL << 9)
+#define LLTRACE_EVENT_T_SYSCALL	(0x4ULL << 9)
+#define LLTRACE_EVENT_T_SYSRET	(0x5ULL << 9)
+
+/*
+ * variable len events use extra slots on the ring.
+ */ + +#define LLTRACE_EVENT_VARLEN_MASK (0x00fULL) /* low 4bits are the len */ + +#define LLTRACE_EVENT_PID (LLTRACE_EVENT_T_VARLEN | 0x10) +#define LLTRACE_EVENT_LOCKNAME (LLTRACE_EVENT_T_VARLEN | 0x70) + +/* hardcode the space used by PC entries */ +#define LLTRACE_EVENT_PC_K (LLTRACE_EVENT_T_VARLEN | 0x80) +#define LLTRACE_EVENT_PC_U (LLTRACE_EVENT_T_VARLEN | 0x90) + +/* + * mark a particular event occuring + */ + +#define LLTRACE_EVENT_IDLE (LLTRACE_EVENT_T_MARK | 0x0) + +#define LLTRACE_EVENT_RUNNABLE (LLTRACE_EVENT_T_MARK | 0x1) + /* arg32 is tid */ + +#define LLTRACE_EVENT_TRAP (LLTRACE_EVENT_T_MARK | 0x2) +#define LLTRACE_EVENT_TRAPRET (LLTRACE_EVENT_T_MARK | 0x3) + /* arg32 is trap id */ +#define LLTRACE_TRAP_PAGEFAULT 14 /* as per kutrace */ + +#define LLTRACE_EVENT_SCHED (LLTRACE_EVENT_T_MARK | 0x4) +#define LLTRACE_EVENT_SCHEDRET (LLTRACE_EVENT_T_MARK | 0x5) + +#define LLTRACE_EVENT_IPI (LLTRACE_EVENT_T_MARK | 0x6) + /* arg32 is cpu */ + +#define LLTRACE_EVENT_LOCK(_t) (LLTRACE_EVENT_T_MARK | 0x10 | (_t)) +#define LLTRACE_LOCK_NOACQUIRE (0x00) +#define LLTRACE_LOCK_ACQUIRE (0x01) +#define LLTRACE_LOCK_WAKEUP (0x02) + +/* + * irqs + */ + +#define LLTRACE_EVENT_IRQ(_c) (LLTRACE_EVENT_T_IRQ | 0x000 | (_c)) +#define LLTRACE_EVENT_IRQRET(_c) (LLTRACE_EVENT_T_IRQ | 0x100 | (_c)) + +#define LLTRACE_IRQ_LOCAL_TIMER (0xecULL) /* like linux */ +#define LLTRACE_IRQ_IPI (0xfdULL) /* like linux */ + +#define LLTRACE_IRQ_BOTTOM_HALF (0xffULL) /* like kutrace */ + +/* + * syscalls and returns from syscalls + */ + +#define LLTRACE_SYSCALL_MASK(_c) ((uint64_t)(_c) & 0x1ff) + +#define LLTRACE_EVENT_SYSCALL(_c) \ + (LLTRACE_EVENT_T_SYSCALL | LLTRACE_SYSCALL_MASK(_c)) +#define LLTRACE_EVENT_SYSRET(_c) \ + (LLTRACE_EVENT_T_SYSRET | LLTRACE_SYSCALL_MASK(_c)) + +/* + * KUTrace event types for compatibility + */ + +#define KUTRACE_FILENAME (0x001ULL) +#define KUTRACE_PIDNAME (0x002ULL) +#define KUTRACE_METHODNAME (0x003ULL) +#define KUTRACE_TRAPNAME (0x004ULL) +#define KUTRACE_LOCKNAME (0x007ULL) + +#define KUTRACE_USERPID (0x200ULL) +#define KUTRACE_RUNNABLE (0x206ULL) +#define KUTRACE_IPI (0x207ULL) +#define KUTRACE_MWAIT (0x208ULL) +#define KUTRACE_PSTATE (0x209ULL) + +#define KUTRACE_LOCKNOACQUIRE (0x210ULL) +#define KUTRACE_LOCKACQUIRE (0x211ULL) +#define KUTRACE_LOCKWAKEUP (0x212ULL) + +#define KUTRACE_PC_U (0x280ULL) +#define KUTRACE_PC_K (0x281ULL) + +/* these are in blocks of 256 */ +#define KUTRACE_TRAP (0x400ULL) +#define KUTRACE_IRQ (0x500ULL) +#define KUTRACE_TRAPRET (0x600ULL) +#define KUTRACE_IRQRET (0x700ULL) + +#define KUTRACE_LOCAL_TIMER_VECTOR (0xec) + +/* these are in blocks of 512 */ +#define KUTRACE_SYSCALL_MASK(_c) ((uint64_t)(_c) & 0x1ff) +#define KUTRACE_SYSCALL_SCHED 511 + +#define KUTRACE_SYSCALL(_c) (0x800ULL | KUTRACE_SYSCALL_MASK(_c)) +#define KUTRACE_SYSRET(_c) (0xa00ULL | KUTRACE_SYSCALL_MASK(_c)) + +/* Specific trap number for page fault */ +#define KUTRACE_PAGEFAULT 14 + +#ifdef _KERNEL + +struct lltrace_cpu; + +static inline struct lltrace_cpu * +lltrace_enter_spc(struct schedstate_percpu *spc) +{ + return (READ_ONCE(spc->spc_lltrace)); +} + +static inline struct lltrace_cpu * +lltrace_enter_cpu(struct cpu_info *ci) +{ + return lltrace_enter_spc(&ci->ci_schedstate); +} + +static inline struct lltrace_cpu * +lltrace_enter(void) +{ + return lltrace_enter_cpu(curcpu()); +} + +static inline void +lltrace_leave(struct lltrace_cpu *llt) +{ + /* nop */ +} + +void lltrace_idle(struct lltrace_cpu *); +void lltrace_statclock(struct lltrace_cpu *, int, unsigned long); + +void 
lltrace_syscall(struct lltrace_cpu *, register_t, + size_t, const register_t *); +void lltrace_sysret(struct lltrace_cpu *, register_t, + int, const register_t [2]); +void lltrace_pidname(struct lltrace_cpu *, struct proc *); +void lltrace_sched_enter(struct lltrace_cpu *); +void lltrace_sched_leave(struct lltrace_cpu *); +void lltrace_runnable(struct lltrace_cpu *, struct proc *); + +void lltrace_trap(struct lltrace_cpu *, unsigned int); +void lltrace_trapret(struct lltrace_cpu *, unsigned int); + +void lltrace_lock(struct lltrace_cpu *, void *, unsigned int); + +/* MD bits */ + +void lltrace_ipi(struct lltrace_cpu *, unsigned int); +#define lltrace_ipi_bcast(_llt) lltrace_ipi((_llt), ~0U); + +void lltrace_irq(struct lltrace_cpu *, unsigned int, unsigned int); +void lltrace_irqret(struct lltrace_cpu *, unsigned int, unsigned int); + +#endif /* _KERNEL */ + +#endif /* _SYS_LLTRACE_H_ */ Index: sys/sys/sched.h =================================================================== RCS file: /cvs/src/sys/sys/sched.h,v retrieving revision 1.57 diff -u -p -r1.57 sched.h --- sys/sys/sched.h 25 Dec 2020 12:49:31 -0000 1.57 +++ sys/sys/sched.h 10 May 2022 05:17:56 -0000 @@ -91,11 +91,13 @@ #define SCHED_NQS 32 /* 32 run queues. */ struct smr_entry; +struct lltrace_cpu; /* * Per-CPU scheduler state. */ struct schedstate_percpu { + struct lltrace_cpu *spc_lltrace; struct proc *spc_idleproc; /* idle proc for this cpu */ TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS]; LIST_HEAD(,proc) spc_deadproc; Index: sys/sys/syscall_mi.h =================================================================== RCS file: /cvs/src/sys/sys/syscall_mi.h,v retrieving revision 1.25 diff -u -p -r1.25 syscall_mi.h --- sys/sys/syscall_mi.h 21 Jan 2020 16:16:23 -0000 1.25 +++ sys/sys/syscall_mi.h 10 May 2022 05:17:56 -0000 @@ -45,6 +45,10 @@ #include #endif +#include "llt.h" +#if NLLT > 0 +#include +#endif /* * The MD setup for a system call has been done; here's the MI part. @@ -76,6 +80,14 @@ mi_syscall(struct proc *p, register_t co KERNEL_UNLOCK(); } #endif +#if NLLT > 0 + { + struct lltrace_cpu *kut = lltrace_enter_cpu(p->p_cpu); + if (kut != NULL) + lltrace_syscall(kut, code, callp->sy_argsize, argp); + lltrace_leave(kut); + } +#endif /* SP must be within MAP_STACK space */ if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p), @@ -113,6 +125,14 @@ static inline void mi_syscall_return(struct proc *p, register_t code, int error, const register_t retval[2]) { +#if NLLT > 0 + { + struct lltrace_cpu *kut = lltrace_enter_cpu(p->p_cpu); + if (kut != NULL) + lltrace_sysret(kut, code, error, retval); + lltrace_leave(kut); + } +#endif #ifdef SYSCALL_DEBUG KERNEL_LOCK(); scdebug_ret(p, code, error, retval); @@ -140,10 +160,19 @@ mi_syscall_return(struct proc *p, regist static inline void mi_child_return(struct proc *p) { -#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0 +#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0 || NLLT > 0 int code = (p->p_flag & P_THREAD) ? SYS___tfork : (p->p_p->ps_flags & PS_PPWAIT) ? 
SYS_vfork : SYS_fork; const register_t child_retval[2] = { 0, 1 }; +#endif + +#if NLLT > 0 + { + struct lltrace_cpu *kut = lltrace_enter_cpu(p->p_cpu); + if (kut != NULL) + lltrace_sysret(kut, code, 0, child_retval); + lltrace_leave(kut); + } #endif TRACEPOINT(sched, on__cpu, NULL); Index: sys/sys/systm.h =================================================================== RCS file: /cvs/src/sys/sys/systm.h,v retrieving revision 1.155 diff -u -p -r1.155 systm.h --- sys/sys/systm.h 9 Dec 2021 00:26:10 -0000 1.155 +++ sys/sys/systm.h 10 May 2022 05:17:56 -0000 @@ -322,8 +322,8 @@ extern struct rwlock netlock; * by the NET_LOCK(). It's a single non-recursive lock for the whole * subsystem. */ -#define NET_LOCK() do { rw_enter_write(&netlock); } while (0) -#define NET_UNLOCK() do { rw_exit_write(&netlock); } while (0) +void NET_LOCK(void); +void NET_UNLOCK(void); /* * Reader version of NET_LOCK() to be used in "softnet" thread only. @@ -332,8 +332,8 @@ extern struct rwlock netlock; * without holding an exclusive lock. This is done to allow read-only * ioctl(2) to not block. */ -#define NET_RLOCK_IN_SOFTNET() do { rw_enter_read(&netlock); } while (0) -#define NET_RUNLOCK_IN_SOFTNET()do { rw_exit_read(&netlock); } while (0) +void NET_RLOCK_IN_SOFTNET(void); +void NET_RUNLOCK_IN_SOFTNET(void); /* * Reader version of NET_LOCK() to be used in ioctl/sysctl path only. @@ -341,8 +341,8 @@ extern struct rwlock netlock; * Can be grabbed instead of the exclusive version when no field * protected by the NET_LOCK() is modified by the ioctl/sysctl. */ -#define NET_RLOCK_IN_IOCTL() do { rw_enter_read(&netlock); } while (0) -#define NET_RUNLOCK_IN_IOCTL() do { rw_exit_read(&netlock); } while (0) +void NET_RLOCK_IN_IOCTL(void); +void NET_RUNLOCK_IN_IOCTL(void); #ifdef DIAGNOSTIC Index: sys/sys/time.h =================================================================== RCS file: /cvs/src/sys/sys/time.h,v retrieving revision 1.61 diff -u -p -r1.61 time.h --- sys/sys/time.h 19 Jun 2021 13:49:39 -0000 1.61 +++ sys/sys/time.h 10 May 2022 05:17:56 -0000 @@ -313,6 +313,8 @@ time_t getuptime(void); uint64_t nsecuptime(void); uint64_t getnsecuptime(void); +unsigned int countertime(void); + struct proc; int clock_gettime(struct proc *, clockid_t, struct timespec *); Index: sys/sys/tracepoint.h =================================================================== RCS file: /cvs/src/sys/sys/tracepoint.h,v retrieving revision 1.1 diff -u -p -r1.1 tracepoint.h --- sys/sys/tracepoint.h 21 Jan 2020 16:16:23 -0000 1.1 +++ sys/sys/tracepoint.h 10 May 2022 05:17:56 -0000 @@ -32,5 +32,36 @@ #define TRACEPOINT(func, name, args...) #endif /* NDT > 0 */ + +#include "llt.h" +#if NLLT > 0 +#include + +#define LLTRACE_SPC(_spc, _fn, ...) { \ + struct lltrace_cpu *_llt = lltrace_enter_spc((_spc)); \ + if (_llt != NULL) \ + (_fn)(_llt __VA_OPT__(,) __VA_ARGS__); \ + lltrace_leave(_llt); \ +} while (0) + +#define LLTRACE_CPU(_ci, _fn, ...) { \ + struct lltrace_cpu *_llt = lltrace_enter_ci((_ci)); \ + if (_llt != NULL) \ + (_fn)(_llt __VA_OPT__(,) __VA_ARGS__); \ + lltrace_leave(_llt); \ +} while (0) + +#define LLTRACE(_fn, ...) { \ + struct lltrace_cpu *_llt = lltrace_enter(); \ + if (_llt != NULL) \ + (_fn)(_llt __VA_OPT__(,) __VA_ARGS__); \ + lltrace_leave(_llt); \ +} while (0) + +#else /* NLLT > 0 */ + +#define LLTRACE(_fn, ...) 
+ +#endif /* NLLT > 0 */ #endif /* _KERNEL */ #endif /* _SYS_TRACEPOINT_H_ */ Index: sys/uvm/uvm_fault.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_fault.c,v retrieving revision 1.129 diff -u -p -r1.129 uvm_fault.c --- sys/uvm/uvm_fault.c 4 Apr 2022 09:27:05 -0000 1.129 +++ sys/uvm/uvm_fault.c 10 May 2022 05:17:56 -0000 @@ -28,6 +28,8 @@ * from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp */ +#include "llt.h" + /* * uvm_fault.c: fault handler */ @@ -43,6 +45,10 @@ #include +#if NLLT > 0 +#include +#endif + /* * * a word on page faults: @@ -578,6 +584,15 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad struct vm_page *pages[UVM_MAXRANGE]; int error; +#if NLLT > 0 + struct lltrace_cpu *llt; + + llt = lltrace_enter(); + if (llt != NULL) + lltrace_trap(llt, LLTRACE_TRAP_PAGEFAULT); + lltrace_leave(llt); +#endif + counters_inc(uvmexp_counters, faults); TRACEPOINT(uvm, fault, vaddr, fault_type, access_type, NULL); @@ -641,6 +656,13 @@ uvm_fault(vm_map_t orig_map, vaddr_t vad } } } + +#if NLLT > 0 + llt = lltrace_enter(); + if (llt != NULL) + lltrace_trapret(llt, LLTRACE_TRAP_PAGEFAULT); + lltrace_leave(llt); +#endif return error; } Index: usr.sbin/lltrace/Makefile =================================================================== RCS file: usr.sbin/lltrace/Makefile diff -N usr.sbin/lltrace/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.sbin/lltrace/Makefile 10 May 2022 05:18:00 -0000 @@ -0,0 +1,13 @@ +# $OpenBSD$ + +PROG= lltrace +SRCS= lltrace.c +MAN= + +LDADD= -levent +DPADD= ${LIBEVENT} + +WARNINGS= Yes +DEBUG= -g + +.include Index: usr.sbin/lltrace/lltrace.c =================================================================== RCS file: usr.sbin/lltrace/lltrace.c diff -N usr.sbin/lltrace/lltrace.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ usr.sbin/lltrace/lltrace.c 10 May 2022 05:18:00 -0000 @@ -0,0 +1,662 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2022 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "/sys/sys/lltrace.h" + +#ifndef nitems +#define nitems(_a) (sizeof((_a)) / sizeof((_a)[0])) +#endif + +#define DEV_KUTRACE "/dev/lltrace" + +#define NRINGS_DEFAULT 256 /* 256 * 8192 * 8 is 16MB */ + +struct lltrace; + +struct mode { + const char *name; + void *(*setup)(struct lltrace *, int, char **); + int (*run)(struct lltrace *); +}; + +static void *mode_kill_setup(struct lltrace *, int, char **); +static int mode_kill_run(struct lltrace *); + +static const struct mode mode_kill = { + "kill", mode_kill_setup, mode_kill_run +}; + +static void *mode_wait_setup(struct lltrace *, int, char **); +static int mode_wait_run(struct lltrace *); +static void *mode_exec_setup(struct lltrace *, int, char **); +static int mode_exec_run(struct lltrace *); + +static const struct mode modes[] = { + { "wait", mode_wait_setup, mode_wait_run }, + { "exec", mode_exec_setup, mode_exec_run }, +}; + +static const struct mode * + mode_lookup(const char *); +static const char *outfile_default(void); + +__dead static void +usage(void) +{ + extern char *__progname; + + fprintf(stderr, "usage: %s [-v] [-o output] [command]\n", __progname); + fprintf(stderr, " %s wait seconds\n", __progname); + fprintf(stderr, " %s exec program ...\n", __progname); + + exit(-1); +} + +struct lltrace { + const char *outfile; + int dv; /* /dev/lltrace fd */ + int of; /* outfile fd */ + void *mode; + + struct event dv_ev; /* handle reading from the kernel */ + + size_t nbuffers; + struct lltrace_buffer + *buffers; + size_t buffer_idx; + + uint64_t nsec_first; + uint64_t nsec_last; + uint64_t count_buffers; + uint64_t count_slots; + uint64_t count_drops; +}; + +static void lltrace_start(struct lltrace *); +static void lltrace_stop(struct lltrace *); + +static void lltrace_read(int, short, void *); +static void lltrace_flush(struct lltrace *); + +int +main(int argc, char *argv[]) +{ + const struct mode *mode = &mode_kill; + int ch; + const char *errstr; + int verbose = 0; + int prio; + + struct lltrace lltrace = { + .outfile = NULL, + .nbuffers = NRINGS_DEFAULT, + + .nsec_first = ~0, + .nsec_last = 0, + .count_buffers = 0, + .count_slots = 0, + .count_drops = 0, + }; + struct lltrace *llt = &lltrace; + + while ((ch = getopt(argc, argv, "n:o:v")) != -1) { + switch (ch) { + case 'n': + llt->nbuffers = strtonum(optarg, 4, 4096, &errstr); + if (errstr != NULL) { + errx(1, "number of buffers %s: %s", + optarg, errstr); + } + break; + case 'o': + llt->outfile = optarg; + break; + case 'v': + verbose = 1; + break; + default: + usage(); + /* NOTREACHED */ + } + } + + argc -= optind; + argv += optind; + + optreset = optind = opterr = 1; /* kill mode has to be careful */ + + if (argc > 0) { + mode = mode_lookup(argv[0]); + if (mode == NULL) + errx(1, "unknown mode %s", argv[0]); + } + + if (llt->outfile == NULL) + llt->outfile = outfile_default(); + + event_init(); + + llt->mode = (*mode->setup)(llt, argc, argv); + + llt->dv = open(DEV_KUTRACE, O_NONBLOCK|O_RDWR|O_CLOEXEC); + if (llt->dv == -1) + err(1, "%s", DEV_KUTRACE); + + event_set(&llt->dv_ev, llt->dv, EV_READ|EV_PERSIST, + lltrace_read, llt); + + llt->of = open(llt->outfile, O_WRONLY|O_CREAT|O_CLOEXEC|O_TRUNC, 0640); + if (llt->of == -1) + err(1, "open %s", llt->outfile); + + llt->buffers = calloc(llt->nbuffers, sizeof(*llt->buffers)); + if (llt->buffers == NULL) + err(1, "unable to allocate %zu buffers", llt->nbuffers); 
+ + llt->buffer_idx = 0; + + if ((*mode->run)(llt) == -1) + exit(1); + + prio = getpriority(PRIO_PROCESS, 0); + if (setpriority(PRIO_PROCESS, 0, -20) == -1) + err(1, "setpriority -20"); + + lltrace_start(llt); + + event_dispatch(); + + if (setpriority(PRIO_PROCESS, 0, prio) == -1) + err(1, "setpriority %d", prio); + + if (llt->buffer_idx != 0) + lltrace_flush(llt); + + if (verbose) { + uint64_t diff = llt->nsec_last - llt->nsec_first; + double interval = (double)diff / 1000000000.0; + int mib[] = { CTL_HW, HW_NCPU }; + int ncpus; + size_t ncpuslen = sizeof(ncpus); + + if (sysctl(mib, nitems(mib), &ncpus, &ncpuslen, NULL, 0) == -1) + err(1, "sysctl hw.ncpus"); + + printf("output file: %s\n", llt->outfile); + printf("interval: %.03lfs, ncpus: %d\n", interval, ncpus); + printf("buffers: %llu (%.01lf/cpu/s), " + "slots: %llu (%.01lf/cpu/s)\n", + llt->count_buffers, llt->count_buffers / interval / ncpus, + llt->count_slots, llt->count_slots / interval / ncpus); + printf("drops: %llu (%.01lf/cpu/s)\n", + llt->count_drops, llt->count_drops / interval / ncpus); + } + + return (0); +} + +static void +lltrace_start(struct lltrace *llt) +{ + event_add(&llt->dv_ev, NULL); + + if (ioctl(llt->dv, LLTIOCSTART) == -1) + err(1, "lltrace start"); +} + +static void +lltrace_flush(struct lltrace *llt) +{ + size_t len; + ssize_t rv; + + len = llt->buffer_idx * sizeof(*llt->buffers); + rv = write(llt->of, llt->buffers, len); + if (rv == -1) + err(1, "%s write", llt->outfile); + + if ((size_t)rv < len) { + errx(1, "%s write short (%zd/%zu bytes)", + llt->outfile, rv, len); + } +} + +static int +lltrace_read_one(struct lltrace *llt) +{ + struct lltrace_buffer *buffer; + ssize_t rv; + uint64_t nsec; + + if (llt->buffer_idx >= llt->nbuffers) { + size_t i, j; + + lltrace_flush(llt); + + /* reset */ + llt->buffer_idx = 0; + + /* + * memset(llt->buffers, 0, + * llt->nbuffers * sizeof(*llt->buffers)); + */ + for (i = 0; i < llt->nbuffers; i++) { + buffer = llt->buffers + i; + + for (j = 0; j < nitems(buffer->llt_slots); j++) + buffer->llt_slots[j] = 0; + } + } + + buffer = llt->buffers + llt->buffer_idx; + rv = read(llt->dv, buffer, sizeof(*buffer)); + if (rv == -1) { + switch (errno) { + case EAGAIN: + /* try again later */ + return (EAGAIN); + case ENOENT: + /* we're done */ + event_del(&llt->dv_ev); + return (ENOENT); + default: + err(1, "%s read", DEV_KUTRACE); + /* NOTREACHED */ + } + } + + if (rv == 0) { + /* we're done */ + event_del(&llt->dv_ev); + return (ENOENT); + } + + llt->buffer_idx++; + + nsec = buffer->llt_slots[3]; + if (nsec < llt->nsec_first) + llt->nsec_first = nsec; + + nsec = buffer->llt_slots[5]; + if (nsec > llt->nsec_last) + llt->nsec_last = nsec; + + llt->count_buffers++; + llt->count_slots += rv / sizeof(uint64_t); + //llt->count_drops += buffer->slots[7]; + + return (0); +} + +static void +lltrace_read(int dv, short events, void *arg) +{ + struct lltrace *llt = arg; + + lltrace_read_one(llt); +} + +static void +lltrace_stop(struct lltrace *llt) +{ + int error; + + if (ioctl(llt->dv, LLTIOCSTOP) == -1) { + if (errno != EALREADY) + err(1, "lltrace stop"); + } + + do { + error = lltrace_read_one(llt); + } while (error == 0); + + event_del(&llt->dv_ev); +} + +static const char * +outfile_default(void) +{ + extern char *__progname; + char host[MAXHOSTNAMELEN]; + time_t now; + struct tm *tm; + char *outfile; + + if (gethostname(host, sizeof(host)) == -1) + err(1, "gethostname"); + + now = time(NULL); + + tm = localtime(&now); + + if (asprintf(&outfile, "%s_%04d%02d%02d_%02d%02d%02d_%s.trace", + 
__progname, + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec, + host) == -1) + errx(1, "error generating default output filename"); + + return (outfile); +} + +#if 0 +static int +printable(int ch) +{ + if (ch == '\0') + return ('_'); + if (!isprint(ch)) + return ('~'); + + return (ch); +} + +static void +hexdump(const void *d, size_t datalen) +{ + const uint8_t *data = d; + size_t i, j = 0; + + for (i = 0; i < datalen; i += j) { +#if 0 + printf("%04zu: ", i); + for (j = 0; j < 16 && i+j < datalen; j++) + printf("%02x ", data[i + j]); + while (j++ < 16) + printf(" "); +#endif + printf("|"); + + for (j = 0; j < 16 && i+j < datalen; j++) + putchar(printable(data[i + j])); + printf("|\n"); + } +} +#endif + +static const struct mode * +mode_lookup(const char *name) +{ + size_t i; + + for (i = 0; i < nitems(modes); i++) { + const struct mode *mode = &modes[i]; + + if (strcmp(mode->name, name) == 0) + return (mode); + } + + return (NULL); +} + +static void +mode_kill_event(int nil, short events, void *arg) +{ + struct lltrace *llt = arg; + struct event *ev = llt->mode; + + fprintf(stdout, "lltrace stopped\n"); + fflush(stdout); + + event_del(ev); + + lltrace_stop(llt); +} + +static void * +mode_kill_setup(struct lltrace *llt, int argc, char *argv[]) +{ + struct event *ev; + + if (argc != 0) + usage(); + + ev = malloc(sizeof(*ev)); + if (ev == NULL) + err(1, NULL); + + signal_set(ev, SIGINT, mode_kill_event, llt); + return (ev); +} + +static int +mode_kill_run(struct lltrace *llt) +{ + struct event *ev = llt->mode; + + signal_add(ev, NULL); + + fprintf(stdout, "lltrace starting, press Ctrl-C to end...\n"); + fflush(stdout); + + return (0); +} + +/* + * lltrace for specified number of seconds. + */ + +struct mode_wait_state { + struct lltrace *llt; + struct timeval tv; + struct event tmo; + struct event sig; +}; + +static void +mode_wait_tmo(int wat, short events, void *arg) +{ + struct mode_wait_state *state = arg; + struct lltrace *llt = state->llt; + + signal_del(&state->sig); + lltrace_stop(llt); +} + +static void +mode_wait_sig(int wat, short events, void *arg) +{ + struct mode_wait_state *state = arg; + struct lltrace *llt = state->llt; + + evtimer_del(&state->tmo); + signal_del(&state->sig); + lltrace_stop(llt); +} + +static void * +mode_wait_setup(struct lltrace *llt, int argc, char *argv[]) +{ + struct mode_wait_state *state; + const char *errstr; + + if (argc != 2) + usage(); + + state = malloc(sizeof(*state)); + if (state == NULL) + err(1, NULL); + + state->llt = llt; + + state->tv.tv_sec = strtonum(argv[1], 1, 600, &errstr); + if (errstr != NULL) + errx(1, "wait time %s: %s", argv[1], errstr); + + state->tv.tv_usec = 0; + + evtimer_set(&state->tmo, mode_wait_tmo, state); + signal_set(&state->sig, SIGINT, mode_wait_sig, state); + + return (state); +} + +static int +mode_wait_run(struct lltrace *llt) +{ + struct mode_wait_state *state = llt->mode; + + evtimer_add(&state->tmo, &state->tv); + signal_add(&state->sig, NULL); + + return (0); +} + +/* + * trace the execution of a (child) program + */ + +struct mode_exec_state { + struct lltrace *llt; + + char **argv; + + pid_t pid; + struct event sigchld; + struct event sigint; + + uid_t uid; + gid_t gid; + gid_t groups[NGROUPS_MAX]; + int ngroups; +}; + +static void +mode_exec_sig(int wat, short events, void *arg) +{ + struct mode_exec_state *state = arg; + struct lltrace *llt = state->llt; + + /* do we check the pid? 
*/ + + signal_del(&state->sigchld); + signal_del(&state->sigint); + lltrace_stop(llt); +} + +static void * +mode_exec_setup(struct lltrace *llt, int argc, char *argv[]) +{ + struct mode_exec_state *state; + const char *user = NULL; + int ch; + + while ((ch = getopt(argc, argv, "u:")) != -1) { + switch (ch) { + case 'u': + user = optarg; + break; + default: + usage(); + /* NOTREACHED */ + } + } + + argc -= optind; + argv += optind; + + if (argc == 0) { + warnx("no command specified"); + usage(); + } + + state = malloc(sizeof(*state)); + if (state == NULL) + err(1, NULL); + + state->llt = llt; + state->argv = argv; + state->uid = 0; + state->pid = -1; /* not yet */ + signal_set(&state->sigchld, SIGCHLD, mode_exec_sig, state); + signal_set(&state->sigint, SIGINT, mode_exec_sig, state); + + if (user != NULL) { + struct passwd *pw; + + pw = getpwnam(user); + if (pw == NULL) + errx(1, "unable to lookup user %s", user); + + state->uid = pw->pw_uid; + state->gid = pw->pw_gid; + + endpwent(); + + state->ngroups = nitems(state->groups); + if (getgrouplist(user, pw->pw_gid, + state->groups, &state->ngroups) == -1) + errx(1, "unable to get groups for user %s", user); + } + + return (state); +} + +static int +mode_exec_run(struct lltrace *llt) +{ + struct mode_exec_state *state = llt->mode; + + signal_add(&state->sigchld, NULL); + signal_add(&state->sigint, NULL); + + state->pid = fork(); + switch (state->pid) { + case -1: + err(1, "unable to fork"); + /* NOTREACHED */ + case 0: /* child */ + break; + default: /* parent */ + return (0); + } + + if (state->uid != 0) { + if (setresgid(state->gid, state->gid, state->gid) == -1) + err(1, "setresgid %d", state->gid); + + if (setgroups(state->ngroups, state->groups) == -1) + err(1, "setgroups"); + + if (setresuid(state->uid, state->uid, state->uid) == -1) + err(1, "setresuid %d", state->uid); + } + + execvp(state->argv[0], state->argv); + + err(1, "exec %s", state->argv[0]); + return (-1); +}
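
With the above in place, the tool is driven from the command line roughly
like this (a sketch based on the usage() text and mode table above; the
output file names and the _pbuild user are only examples):

	lltrace -o idle.trace			trace until Ctrl-C (default mode)
	lltrace -n 512 wait 10			trace for 10 seconds using 512 buffers
	lltrace exec -u _pbuild make obj	trace while running a command as another user

The resulting .trace file is simply the sequence of lltrace_buffer structs
read from /dev/lltrace, written out verbatim by lltrace_flush().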